bool aarch64_pars_overlap_p (rtx, rtx);
bool aarch64_simd_scalar_immediate_valid_for_move (rtx, scalar_int_mode);
bool aarch64_simd_shift_imm_p (rtx, machine_mode, bool);
+bool aarch64_sve_valid_pred_p (rtx, machine_mode);
bool aarch64_sve_ptrue_svpattern_p (rtx, struct simd_immediate_info *);
bool aarch64_simd_valid_and_imm (rtx);
bool aarch64_simd_valid_and_imm_fmov (rtx, unsigned int * = NULL);
rtx aarch64_ptrue_reg (machine_mode, machine_mode);
rtx aarch64_pfalse_reg (machine_mode);
bool aarch64_sve_same_pred_for_ptest_p (rtx *, rtx *);
+rtx aarch64_sve_packed_pred (machine_mode);
+rtx aarch64_sve_fp_pred (machine_mode, rtx *);
void aarch64_emit_load_store_through_mode (rtx, rtx, machine_mode);
bool aarch64_expand_maskloadstore (rtx *, machine_mode);
void aarch64_emit_sve_pred_move (rtx, rtx, rtx);
;; ---- [FP<-INT] Packs
;; ---- [FP<-INT] Unpacks
;; ---- [FP<-FP] Packs
+;; ---- [FP<-FP] Truncating conversions
;; ---- [FP<-FP] Packs (bfloat16)
;; ---- [FP<-FP] Unpacks
+;; ---- [FP<-FP] Extending conversions
;; ---- [PRED<-PRED] Packs
;; ---- [PRED<-PRED] Unpacks
;;
;; - FCVTZU
;; -------------------------------------------------------------------------
-;; Unpredicated conversion of floats to integers of the same size (HF to HI,
-;; SF to SI or DF to DI).
-(define_expand "<optab><mode><v_int_equiv>2"
- [(set (match_operand:<V_INT_EQUIV> 0 "register_operand")
- (unspec:<V_INT_EQUIV>
+;; Unpredicated conversion of floats to integers of the same size or wider,
+;; excluding conversions from DF (see below).
+(define_expand "<optab><SVE_HSF:mode><SVE_HSDI:mode>2"
+ [(set (match_operand:SVE_HSDI 0 "register_operand")
+ (unspec:SVE_HSDI
+ [(match_dup 2)
+ (match_dup 3)
+ (match_operand:SVE_HSF 1 "register_operand")]
+ SVE_COND_FCVTI))]
+ "TARGET_SVE
+ && (~(<SVE_HSDI:self_mask> | <SVE_HSDI:narrower_mask>) & <SVE_HSF:self_mask>) == 0"
+ {
+ operands[2] = aarch64_sve_fp_pred (<SVE_HSDI:MODE>mode, &operands[3]);
+ }
+)
+
+;; SI <- DF can't use SI <- trunc (DI <- DF) without -ffast-math, so this
+;; truncating variant of FCVTZ{S,U} is useful for auto-vectorization.
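+;;
+;; For example, a loop such as (an illustrative sketch, not from the
+;; testsuite)
+;;   void f (int32_t *x, double *y, int n)
+;;   { for (int i = 0; i < n; ++i) x[i] = (int32_t) y[i]; }
+;; can then be vectorized using FCVTZS .s, .d directly.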
+;;
+;; DF is the only source mode for which the mask used above doesn't apply,
+;; so we define a separate pattern for it here.
+(define_expand "<optab><VNx2DF_ONLY:mode><SVE_2SDI:mode>2"
+ [(set (match_operand:SVE_2SDI 0 "register_operand")
+ (unspec:SVE_2SDI
[(match_dup 2)
(const_int SVE_RELAXED_GP)
- (match_operand:SVE_FULL_F 1 "register_operand")]
+ (match_operand:VNx2DF_ONLY 1 "register_operand")]
SVE_COND_FCVTI))]
"TARGET_SVE"
{
- operands[2] = aarch64_ptrue_reg (<VPRED>mode);
+ operands[2] = aarch64_ptrue_reg (VNx2BImode);
}
)
}
)
-;; Predicated narrowing float-to-integer conversion.
-(define_insn "@aarch64_sve_<optab>_trunc<VNx2DF_ONLY:mode><VNx4SI_ONLY:mode>"
- [(set (match_operand:VNx4SI_ONLY 0 "register_operand")
- (unspec:VNx4SI_ONLY
+;; As above, for pairs that are used by the auto-vectorizer only.
+(define_insn "*aarch64_sve_<optab>_nontrunc<SVE_PARTIAL_F:mode><SVE_HSDI:mode>"
+ [(set (match_operand:SVE_HSDI 0 "register_operand")
+ (unspec:SVE_HSDI
+ [(match_operand:<SVE_HSDI:VPRED> 1 "aarch64_predicate_operand")
+ (match_operand:SI 3 "aarch64_sve_gp_strictness")
+ (match_operand:SVE_PARTIAL_F 2 "register_operand")]
+ SVE_COND_FCVTI))]
+ "TARGET_SVE
+ && (~(<SVE_HSDI:self_mask> | <SVE_HSDI:narrower_mask>) & <SVE_PARTIAL_F:self_mask>) == 0"
+ {@ [ cons: =0 , 1 , 2 ; attrs: movprfx ]
+ [ w , Upl , 0 ; * ] fcvtz<su>\t%0.<SVE_HSDI:Vetype>, %1/m, %2.<SVE_PARTIAL_F:Vetype>
+ [ ?&w , Upl , w ; yes ] movprfx\t%0, %2\;fcvtz<su>\t%0.<SVE_HSDI:Vetype>, %1/m, %2.<SVE_PARTIAL_F:Vetype>
+ }
+)
+
+;; Predicated narrowing float-to-integer conversion. The VNx2DF->VNx4SI
+;; variant is provided for the ACLE, where the zeroed odd-indexed lanes are
+;; significant. The VNx2DF->VNx2SI variant is provided for auto-vectorization,
+;; where the upper 32 bits of each container are ignored.
+(define_insn "@aarch64_sve_<optab>_trunc<VNx2DF_ONLY:mode><SVE_SI:mode>"
+ [(set (match_operand:SVE_SI 0 "register_operand")
+ (unspec:SVE_SI
[(match_operand:VNx2BI 1 "register_operand")
(match_operand:SI 3 "aarch64_sve_gp_strictness")
(match_operand:VNx2DF_ONLY 2 "register_operand")]
SVE_COND_FCVTI))]
"TARGET_SVE"
{@ [ cons: =0 , 1 , 2 ; attrs: movprfx ]
- [ w , Upl , 0 ; * ] fcvtz<su>\t%0.<VNx4SI_ONLY:Vetype>, %1/m, %2.<VNx2DF_ONLY:Vetype>
- [ ?&w , Upl , w ; yes ] movprfx\t%0, %2\;fcvtz<su>\t%0.<VNx4SI_ONLY:Vetype>, %1/m, %2.<VNx2DF_ONLY:Vetype>
+ [ w , Upl , 0 ; * ] fcvtz<su>\t%0.<SVE_SI:Vetype>, %1/m, %2.<VNx2DF_ONLY:Vetype>
+ [ ?&w , Upl , w ; yes ] movprfx\t%0, %2\;fcvtz<su>\t%0.<SVE_SI:Vetype>, %1/m, %2.<VNx2DF_ONLY:Vetype>
}
)
;; - UCVTF
;; -------------------------------------------------------------------------
-;; Unpredicated conversion of integers to floats of the same size
-;; (HI to HF, SI to SF or DI to DF).
-(define_expand "<optab><v_int_equiv><mode>2"
- [(set (match_operand:SVE_FULL_F 0 "register_operand")
- (unspec:SVE_FULL_F
+;; Unpredicated conversion of integers to floats of the same size or
+;; narrower.
+(define_expand "<optab><SVE_HSDI:mode><SVE_F:mode>2"
+ [(set (match_operand:SVE_F 0 "register_operand")
+ (unspec:SVE_F
[(match_dup 2)
- (const_int SVE_RELAXED_GP)
- (match_operand:<V_INT_EQUIV> 1 "register_operand")]
+ (match_dup 3)
+ (match_operand:SVE_HSDI 1 "register_operand")]
SVE_COND_ICVTF))]
- "TARGET_SVE"
+ "TARGET_SVE
+ && (~(<SVE_HSDI:self_mask> | <SVE_HSDI:narrower_mask>) & <SVE_F:self_mask>) == 0"
{
- operands[2] = aarch64_ptrue_reg (<VPRED>mode);
+ operands[2] = aarch64_sve_fp_pred (<SVE_HSDI:MODE>mode, &operands[3]);
}
)
}
)
+;; As above, for pairs that are used by the auto-vectorizer only.
+(define_insn "*aarch64_sve_<optab>_nonextend<SVE_HSDI:mode><SVE_PARTIAL_F:mode>"
+ [(set (match_operand:SVE_PARTIAL_F 0 "register_operand")
+ (unspec:SVE_PARTIAL_F
+ [(match_operand:<SVE_HSDI:VPRED> 1 "aarch64_predicate_operand")
+ (match_operand:SI 3 "aarch64_sve_gp_strictness")
+ (match_operand:SVE_HSDI 2 "register_operand")]
+ SVE_COND_ICVTF))]
+ "TARGET_SVE
+ && (~(<SVE_HSDI:self_mask> | <SVE_HSDI:narrower_mask>) & <SVE_PARTIAL_F:self_mask>) == 0"
+ {@ [ cons: =0 , 1 , 2 ; attrs: movprfx ]
+ [ w , Upl , 0 ; * ] <su>cvtf\t%0.<SVE_PARTIAL_F:Vetype>, %1/m, %2.<SVE_HSDI:Vetype>
+ [ ?&w , Upl , w ; yes ] movprfx\t%0, %2\;<su>cvtf\t%0.<SVE_PARTIAL_F:Vetype>, %1/m, %2.<SVE_HSDI:Vetype>
+ }
+)
+
;; Predicated widening integer-to-float conversion.
(define_insn "@aarch64_sve_<optab>_extend<VNx4SI_ONLY:mode><VNx2DF_ONLY:mode>"
[(set (match_operand:VNx2DF_ONLY 0 "register_operand")
}
)
+;; -------------------------------------------------------------------------
+;; ---- [FP<-FP] Truncating conversions
+;; -------------------------------------------------------------------------
+;; Includes:
+;; - FCVT
+;; -------------------------------------------------------------------------
+
+;; Unpredicated float-to-float truncation.
+(define_expand "trunc<SVE_SDF:mode><SVE_PARTIAL_HSF:mode>2"
+ [(set (match_operand:SVE_PARTIAL_HSF 0 "register_operand")
+ (unspec:SVE_PARTIAL_HSF
+ [(match_dup 2)
+ (match_dup 3)
+ (match_operand:SVE_SDF 1 "register_operand")]
+ SVE_COND_FCVT))]
+ "TARGET_SVE && (~<SVE_SDF:narrower_mask> & <SVE_PARTIAL_HSF:self_mask>) == 0"
+ {
+ operands[2] = aarch64_sve_fp_pred (<SVE_SDF:MODE>mode, &operands[3]);
+ }
+)
+
;; Predicated float-to-float truncation.
(define_insn "@aarch64_sve_<optab>_trunc<SVE_FULL_SDF:mode><SVE_FULL_HSF:mode>"
[(set (match_operand:SVE_FULL_HSF 0 "register_operand")
}
)
+;; As above, for pairs that are used by the auto-vectorizer only.
+(define_insn "*aarch64_sve_<optab>_trunc<SVE_SDF:mode><SVE_PARTIAL_HSF:mode>"
+ [(set (match_operand:SVE_PARTIAL_HSF 0 "register_operand")
+ (unspec:SVE_PARTIAL_HSF
+ [(match_operand:<SVE_SDF:VPRED> 1 "aarch64_predicate_operand")
+ (match_operand:SI 3 "aarch64_sve_gp_strictness")
+ (match_operand:SVE_SDF 2 "register_operand")]
+ SVE_COND_FCVT))]
+ "TARGET_SVE && (~<SVE_SDF:narrower_mask> & <SVE_PARTIAL_HSF:self_mask>) == 0"
+ {@ [ cons: =0 , 1 , 2 ; attrs: movprfx ]
+ [ w , Upl , 0 ; * ] fcvt\t%0.<SVE_PARTIAL_HSF:Vetype>, %1/m, %2.<SVE_SDF:Vetype>
+ [ ?&w , Upl , w ; yes ] movprfx\t%0, %2\;fcvt\t%0.<SVE_PARTIAL_HSF:Vetype>, %1/m, %2.<SVE_SDF:Vetype>
+ }
+)
+
;; Predicated float-to-float truncation with merging.
(define_expand "@cond_<optab>_trunc<SVE_FULL_SDF:mode><SVE_FULL_HSF:mode>"
[(set (match_operand:SVE_FULL_HSF 0 "register_operand")
}
)
+;; -------------------------------------------------------------------------
+;; ---- [FP<-FP] Extending conversions
+;; -------------------------------------------------------------------------
+;; Includes:
+;; - FCVT
+;; -------------------------------------------------------------------------
+
+;; Unpredicated float-to-float extension.
+(define_expand "extend<SVE_PARTIAL_HSF:mode><SVE_SDF:mode>2"
+ [(set (match_operand:SVE_SDF 0 "register_operand")
+ (unspec:SVE_SDF
+ [(match_dup 2)
+ (match_dup 3)
+ (match_operand:SVE_PARTIAL_HSF 1 "register_operand")]
+ SVE_COND_FCVT))]
+ "TARGET_SVE && (~<SVE_SDF:narrower_mask> & <SVE_PARTIAL_HSF:self_mask>) == 0"
+ {
+ operands[2] = aarch64_sve_fp_pred (<SVE_SDF:MODE>mode, &operands[3]);
+ }
+)
+
;; Predicated float-to-float extension.
(define_insn "@aarch64_sve_<optab>_nontrunc<SVE_FULL_HSF:mode><SVE_FULL_SDF:mode>"
[(set (match_operand:SVE_FULL_SDF 0 "register_operand")
}
)
+;; As above, for pairs that are used by the auto-vectorizer only.
+(define_insn "*aarch64_sve_<optab>_nontrunc<SVE_PARTIAL_HSF:mode><SVE_SDF:mode>"
+ [(set (match_operand:SVE_SDF 0 "register_operand")
+ (unspec:SVE_SDF
+ [(match_operand:<SVE_SDF:VPRED> 1 "aarch64_predicate_operand")
+ (match_operand:SI 3 "aarch64_sve_gp_strictness")
+ (match_operand:SVE_PARTIAL_HSF 2 "register_operand")]
+ SVE_COND_FCVT))]
+ "TARGET_SVE && (~<SVE_SDF:narrower_mask> & <SVE_PARTIAL_HSF:self_mask>) == 0"
+ {@ [ cons: =0 , 1 , 2 ; attrs: movprfx ]
+ [ w , Upl , 0 ; * ] fcvt\t%0.<SVE_SDF:Vetype>, %1/m, %2.<SVE_PARTIAL_HSF:Vetype>
+ [ ?&w , Upl , w ; yes ] movprfx\t%0, %2\;fcvt\t%0.<SVE_SDF:Vetype>, %1/m, %2.<SVE_PARTIAL_HSF:Vetype>
+ }
+)
+
;; Predicated float-to-float extension with merging.
(define_expand "@cond_<optab>_nontrunc<SVE_FULL_HSF:mode><SVE_FULL_SDF:mode>"
[(set (match_operand:SVE_FULL_SDF 0 "register_operand")
return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
}
+
+/* Generate a predicate to control partial SVE mode DATA_MODE as if it
+ were fully packed, enabling the defined elements only. */
+rtx
+aarch64_sve_packed_pred (machine_mode data_mode)
+{
+ unsigned int container_bytes
+ = aarch64_sve_container_bits (data_mode) / BITS_PER_UNIT;
+  /* Enable only the lowest-numbered bit of each container.  */
+ rtx ptrue = force_reg (VNx16BImode, aarch64_ptrue_all (container_bytes));
+ /* Predicate at the element size. */
+ machine_mode pmode
+ = aarch64_sve_pred_mode (GET_MODE_UNIT_SIZE (data_mode)).require ();
+ return gen_lowpart (pmode, ptrue);
+}
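+
+/* For example, aarch64_sve_packed_pred (VNx2SFmode) builds a PTRUE with one
+   active bit per 8-byte container and returns its VNx4BI lowpart, in which
+   every other element is active.  */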
+
+/* Generate a predicate and strictness value to govern a floating-point
+ operation with SVE mode DATA_MODE.
+
+   If DATA_MODE is a partial vector mode, this pair prevents the operation
+   from interpreting the undefined elements, unless we do not need to
+   suppress their trapping behavior (e.g. with -fno-trapping-math).  */
+rtx
+aarch64_sve_fp_pred (machine_mode data_mode, rtx *strictness)
+{
+ unsigned int vec_flags = aarch64_classify_vector_mode (data_mode);
+ if (flag_trapping_math && (vec_flags & VEC_PARTIAL))
+ {
+ if (strictness)
+ *strictness = gen_int_mode (SVE_STRICT_GP, SImode);
+ return aarch64_sve_packed_pred (data_mode);
+ }
+ if (strictness)
+ *strictness = gen_int_mode (SVE_RELAXED_GP, SImode);
+ /* Use the VPRED mode. */
+ return aarch64_ptrue_reg (aarch64_sve_pred_mode (data_mode));
+}
+
/* Emit a comparison CMP between OP0 and OP1, both of which have mode
DATA_MODE, and return the result in a predicate of mode PRED_MODE.
Use TARGET as the target register if nonnull and convenient. */
return IN_RANGE (INTVAL (x), 1, bit_width);
}
+
+/* Check whether X can control SVE mode MODE. */
+bool
+aarch64_sve_valid_pred_p (rtx x, machine_mode mode)
+{
+ machine_mode pred_mode = GET_MODE (x);
+ if (!aarch64_sve_pred_mode_p (pred_mode))
+ return false;
+
+ return known_ge (GET_MODE_NUNITS (pred_mode),
+ GET_MODE_NUNITS (mode));
+}
+
/* Return the bitmask CONST_INT to select the bits required by a zero extract
operation of width WIDTH at bit position POS. */
;; elements.
(define_mode_iterator SVE_FULL_HSF [VNx8HF VNx4SF])
+;; Partial SVE floating-point vector modes that have 16-bit or 32-bit
+;; elements.
+(define_mode_iterator SVE_PARTIAL_HSF [VNx2HF VNx4HF VNx2SF])
+
+;; SVE floating-point vector modes that have 16-bit or 32-bit elements.
+(define_mode_iterator SVE_HSF [SVE_PARTIAL_HSF SVE_FULL_HSF])
+
;; Fully-packed SVE integer vector modes that have 16-bit or 64-bit elements.
(define_mode_iterator SVE_FULL_HDI [VNx8HI VNx2DI])
(define_mode_iterator SVE_MATMULF [(VNx4SF "TARGET_SVE_F32MM")
(VNx2DF "TARGET_SVE_F64MM")])
+;; SVE floating-point vector modes that have 32-bit or 64-bit elements.
+(define_mode_iterator SVE_SDF [VNx2SF SVE_FULL_SDF])
+
;; Fully-packed SVE vector modes that have 32-bit or smaller elements.
(define_mode_iterator SVE_FULL_BHS [VNx16QI VNx8HI VNx4SI
VNx8BF VNx8HF VNx4SF])
VNx4SI VNx2SI
VNx2DI])
+;; SVE integer vector modes with 32-bit elements.
+(define_mode_iterator SVE_SI [VNx2SI VNx4SI])
+
(define_mode_iterator SVE_DIx24 [VNx4DI VNx8DI])
;; SVE modes with 2 or 4 elements.
(define_mode_iterator SVE_2 [VNx2QI VNx2HI VNx2HF VNx2BF
VNx2SI VNx2SF VNx2DI VNx2DF])
+;; SVE SI and DI modes with 2 elements.
+(define_mode_iterator SVE_2SDI [VNx2SI VNx2DI])
+
;; SVE integer modes with 2 elements, excluding the widest element.
(define_mode_iterator SVE_2BHSI [VNx2QI VNx2HI VNx2SI])
(define_mode_attr data_bytes [(VNx16BI "1") (VNx8BI "2")
(VNx4BI "4") (VNx2BI "8")])
-;; Two-nybble mask for partial vector modes: nunits, byte size.
-(define_mode_attr self_mask [(VNx8QI "0x81")
- (VNx4QI "0x41")
- (VNx2QI "0x21")
- (VNx4HI "0x42")
- (VNx2HI "0x22")
- (VNx2SI "0x24")])
-
-;; For SVE_HSDI vector modes, the mask of narrower modes, encoded as above.
-(define_mode_attr narrower_mask [(VNx8HI "0x81") (VNx4HI "0x41")
- (VNx2HI "0x21")
- (VNx4SI "0x43") (VNx2SI "0x23")
- (VNx2DI "0x27")])
+;; Two-nybble mask for vector modes: nunits, byte size.
+(define_mode_attr self_mask [(VNx2HI "0x22") (VNx2HF "0x22")
+ (VNx4HI "0x42") (VNx4HF "0x42")
+ (VNx8HI "0x82") (VNx8HF "0x82")
+ (VNx2SI "0x24") (VNx2SF "0x24")
+ (VNx4SI "0x44") (VNx4SF "0x44")
+ (VNx2DI "0x28") (VNx2DF "0x28")
+ (VNx8QI "0x81") (VNx4QI "0x41") (VNx2QI "0x21")])
+
+;; The mask of narrower vector modes, encoded as above.
+(define_mode_attr narrower_mask [(VNx8HI "0x81") (VNx8HF "0x81")
+ (VNx4HI "0x41") (VNx4HF "0x41")
+ (VNx2HI "0x21") (VNx2HF "0x21")
+ (VNx4SI "0x43") (VNx4SF "0x43")
+ (VNx2SI "0x23") (VNx2SF "0x23")
+ (VNx2DI "0x27") (VNx2DF "0x27")])
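+
+;; As a worked example, the unpredicated FP<->INT conversion expands in
+;; aarch64-sve.md test
+;;   (~(<SVE_HSDI:self_mask> | <SVE_HSDI:narrower_mask>)
+;;    & <SVE_HSF:self_mask>) == 0
+;; which accepts VNx4HF->VNx4SI, since ~(0x44 | 0x43) & 0x42 == 0 (same
+;; nunits nybble, and the 16-bit element size is covered), but rejects
+;; VNx8HF->VNx4SI, since ~(0x44 | 0x43) & 0x82 == 0x80 (the nunits
+;; nybbles differ).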
;; The constraint to use for an SVE [SU]DOT, FMUL, FMLA or FMLS lane index.
(define_mode_attr sve_lane_con [(VNx8HI "y") (VNx4SI "y") (VNx2DI "x")
return aarch64_simd_shift_imm_p (op, mode, false);
})
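+
+;; A governing predicate whose mode may have more elements than the mode
+;; the context requires.  For example, the VNx4BI predicate created by
+;; aarch64_sve_fp_pred for a partial VNx2SI operation is accepted where a
+;; VNx2BI predicate operand is expected.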
+(define_special_predicate "aarch64_predicate_operand"
+ (and (match_code "reg,subreg")
+ (match_test "register_operand (op, GET_MODE (op))")
+ (match_test "aarch64_sve_valid_pred_p (op, mode)")))
+
(define_predicate "aarch64_simd_imm_zero"
(and (match_code "const,const_vector")
(match_test "op == CONST0_RTX (GET_MODE (op))")))
/* { dg-do compile } */
-/* { dg-options "-O2 -ftree-vectorize" } */
+/* { dg-options "-O2 -ftree-vectorize --param aarch64-vect-compare-costs=0" } */
#include <stdint.h>
/* { dg-do compile } */
-/* { dg-options "-O2 -ftree-vectorize" } */
+/* { dg-options "-O2 -ftree-vectorize --param aarch64-vect-compare-costs=0" } */
#include <stdint.h>
/* { dg-do compile } */
-/* { dg-options "-O2 -ftree-vectorize" } */
+/* { dg-options "-O2 -ftree-vectorize --param aarch64-vect-compare-costs=0" } */
void __attribute__ ((noinline, noclone))
pack_float_plus_1point1 (float *d, double *s, int size)
/* { dg-do compile } */
-/* { dg-options "-O2 -ftree-vectorize" } */
+/* { dg-options "-O2 -ftree-vectorize --param aarch64-vect-compare-costs=0" } */
void __attribute__ ((noinline, noclone))
unpack_float_plus_7point9 (double *d, float *s, int size)
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -msve-vector-bits=2048 -fno-schedule-insns -fno-schedule-insns2" } */
+
+#include <stdint.h>
+
+typedef _Float16 v32hf __attribute__((vector_size(64)));
+typedef _Float16 v64hf __attribute__((vector_size(128)));
+
+typedef float v32sf __attribute__((vector_size(128)));
+typedef float v64sf __attribute__((vector_size(256)));
+
+typedef double v32df __attribute__((vector_size(256)));
+
+typedef int16_t v32hi __attribute__((vector_size(64)));
+typedef int16_t v64hi __attribute__((vector_size(128)));
+typedef uint16_t v32uhi __attribute__((vector_size(64)));
+typedef uint16_t v64uhi __attribute__((vector_size(128)));
+
+typedef int32_t v32si __attribute__((vector_size(128)));
+typedef int32_t v64si __attribute__((vector_size(256)));
+typedef uint32_t v32usi __attribute__((vector_size(128)));
+typedef uint32_t v64usi __attribute__((vector_size(256)));
+
+typedef int64_t v32di __attribute__((vector_size(256)));
+typedef uint64_t v32udi __attribute__((vector_size(256)));
+
+/*
+** float_2hf2hi:
+** ...
+** ld1h (z[0-9]+)\.d, p[0-7]/z, \[x0\]
+** ptrue (p[0-7])\.d, vl32
+** scvtf (z[0-9]+)\.h, \2/m, \1\.h
+** ...
+*/
+v32hf
+float_2hf2hi (v32hi x)
+{
+ return __builtin_convertvector (x, v32hf);
+}
+
+/*
+** float_2hf2uhi:
+** ...
+** ld1h (z[0-9]+)\.d, p[0-7]/z, \[x0\]
+** ptrue (p[0-7])\.d, vl32
+** ucvtf (z[0-9]+)\.h, \2/m, \1\.h
+** ...
+*/
+v32hf
+float_2hf2uhi (v32uhi x)
+{
+ return __builtin_convertvector (x, v32hf);
+}
+
+/*
+** float_2hf2si:
+** ...
+** ld1w (z[0-9]+)\.d, p[0-7]/z, \[x0\]
+** ptrue (p[0-7])\.d, vl32
+** scvtf (z[0-9]+)\.h, \2/m, \1\.s
+** ...
+*/
+v32hf
+float_2hf2si (v32si x)
+{
+ return __builtin_convertvector (x, v32hf);
+}
+
+/*
+** float_2hf2usi:
+** ...
+** ld1w (z[0-9]+)\.d, p[0-7]/z, \[x0\]
+** ptrue (p[0-7])\.d, vl32
+** ucvtf (z[0-9]+)\.h, \2/m, \1\.s
+** ...
+*/
+v32hf
+float_2hf2usi (v32usi x)
+{
+ return __builtin_convertvector (x, v32hf);
+}
+
+/*
+** float_2hf2di:
+** ptrue (p[0-7])\.b, vl256
+** ld1d (z[0-9]+)\.d, \1/z, \[x0\]
+** scvtf (z[0-9]+)\.h, \1/m, \2\.d
+** ...
+*/
+v32hf
+float_2hf2di (v32di x)
+{
+ return __builtin_convertvector (x, v32hf);
+}
+
+/*
+** float_2hf2udi:
+** ptrue (p[0-7])\.b, vl256
+** ld1d (z[0-9]+)\.d, \1/z, \[x0\]
+** ucvtf (z[0-9]+)\.h, \1/m, \2\.d
+** ...
+*/
+v32hf
+float_2hf2udi (v32udi x)
+{
+ return __builtin_convertvector (x, v32hf);
+}
+
+/*
+** float_4hf4hi:
+** ...
+** ld1h (z[0-9]+)\.s, p[0-7]/z, \[x0\]
+** ptrue (p[0-7])\.s, vl64
+** scvtf (z[0-9]+)\.h, \2/m, \1\.h
+** ...
+*/
+v64hf
+float_4hf4hi (v64hi x)
+{
+ return __builtin_convertvector (x, v64hf);
+}
+
+/*
+** float_4hf4uhi:
+** ...
+** ld1h (z[0-9]+)\.s, p[0-7]/z, \[x0\]
+** ptrue (p[0-7])\.s, vl64
+** ucvtf (z[0-9]+)\.h, \2/m, \1\.h
+** ...
+*/
+v64hf
+float_4hf4uhi (v64uhi x)
+{
+ return __builtin_convertvector (x, v64hf);
+}
+
+/*
+** float_4hf4si:
+** ptrue (p[0-7])\.b, vl256
+** ld1w (z[0-9]+)\.s, \1/z, \[x0\]
+** scvtf (z[0-9]+)\.h, \1/m, \2\.s
+** ...
+*/
+v64hf
+float_4hf4si (v64si x)
+{
+ return __builtin_convertvector (x, v64hf);
+}
+
+/*
+** float_4hf4usi:
+** ptrue (p[0-7])\.b, vl256
+** ld1w (z[0-9]+)\.s, \1/z, \[x0\]
+** ucvtf (z[0-9]+)\.h, \1/m, \2\.s
+** ...
+*/
+v64hf
+float_4hf4usi (v64usi x)
+{
+ return __builtin_convertvector (x, v64hf);
+}
+
+/*
+** float_2sf2si:
+** ...
+** ld1w (z[0-9]+)\.d, p[0-7]/z, \[x0\]
+** ptrue (p[0-7])\.d, vl32
+** scvtf (z[0-9]+)\.s, \2/m, \1\.s
+** ...
+*/
+v32sf
+float_2sf2si (v32si x)
+{
+ return __builtin_convertvector (x, v32sf);
+}
+
+/*
+** float_2sf2usi:
+** ...
+** ld1w (z[0-9]+)\.d, p[0-7]/z, \[x0\]
+** ptrue (p[0-7])\.d, vl32
+** ucvtf (z[0-9]+)\.s, \2/m, \1\.s
+** ...
+*/
+v32sf
+float_2sf2usi (v32usi x)
+{
+ return __builtin_convertvector (x, v32sf);
+}
+
+/*
+** float_2sf2di:
+** ptrue (p[0-7])\.b, vl256
+** ld1d (z[0-9]+)\.d, \1/z, \[x0\]
+** scvtf (z[0-9]+)\.s, \1/m, \2\.d
+** ...
+*/
+v32sf
+float_2sf2di (v32di x)
+{
+ return __builtin_convertvector (x, v32sf);
+}
+
+/*
+** float_2sf2udi:
+** ptrue (p[0-7])\.b, vl256
+** ld1d (z[0-9]+)\.d, \1/z, \[x0\]
+** ucvtf (z[0-9]+)\.s, \1/m, \2\.d
+** ...
+*/
+v32sf
+float_2sf2udi (v32udi x)
+{
+ return __builtin_convertvector (x, v32sf);
+}
+
+/* { dg-final { check-function-bodies "**" "" ""} } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -msve-vector-bits=2048 -fno-trapping-math" } */
+
+#include "unpacked_cvtf_1.c"
+
+/* { dg-final { scan-assembler-not {\tptrue\tp[0-7]\.d} } } */
+/* { dg-final { scan-assembler-not {\tptrue\tp[0-7]\.s} } } */
+/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b} 14 } } */
+
+/* { dg-final { scan-assembler-times {\tscvtf\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tucvtf\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tscvtf\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.s\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tucvtf\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.s\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tscvtf\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tucvtf\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.d\n} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tscvtf\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tucvtf\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s\n} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tscvtf\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tucvtf\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.d\n} 1 } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+void f64_i32 (double *restrict x, int32_t *restrict y, int n)
+{
+ for (int i = 0; i < n; i++)
+ x[i] = (double)y[i];
+}
+
+/* { dg-final { scan-assembler-times {\tscvtf\tz[0-9]+\.[sd], p[0-7]/m, z[0-9]+\.d\n} 1 } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -msve-vector-bits=2048 -fno-schedule-insns -fno-schedule-insns2" } */
+
+typedef _Float16 v32hf __attribute__((vector_size(64)));
+typedef _Float16 v64hf __attribute__((vector_size(128)));
+
+typedef float v32sf __attribute__((vector_size(128)));
+typedef float v64sf __attribute__((vector_size(256)));
+
+typedef double v32df __attribute__((vector_size(256)));
+
+/*
+** trunc_2sf2df:
+** ptrue (p[0-7])\.b, vl256
+** ld1d (z[0-9]+)\.d, \1/z, \[x0\]
+** fcvt (z[0-9]+)\.s, \1/m, \2\.d
+** ...
+*/
+v32sf
+trunc_2sf2df (v32df x)
+{
+ return __builtin_convertvector (x, v32sf);
+}
+
+/*
+** trunc_2hf2df:
+** ptrue (p[0-7])\.b, vl256
+** ld1d (z[0-9]+)\.d, \1/z, \[x0\]
+** fcvt (z[0-9]+)\.h, \1/m, \2\.d
+** ...
+*/
+v32hf
+trunc_2hf2df (v32df x)
+{
+ return __builtin_convertvector (x, v32hf);
+}
+
+/*
+** trunc_4hf4sf:
+** ptrue (p[0-7])\.b, vl256
+** ld1w (z[0-9]+)\.s, \1/z, \[x0\]
+** fcvt (z[0-9]+)\.h, \1/m, \2\.s
+** ...
+*/
+v64hf
+trunc_4hf4sf (v64sf x)
+{
+ return __builtin_convertvector (x, v64hf);
+}
+
+/*
+** trunc_2hf2sf:
+** ...
+** ld1w (z[0-9]+)\.d, p[0-7]/z, \[x0\]
+** ptrue (p[0-7])\.d, vl32
+** fcvt (z[0-9]+)\.h, \2/m, \1\.s
+** ...
+*/
+v32hf
+trunc_2hf2sf (v32sf x)
+{
+ return __builtin_convertvector (x, v32hf);
+}
+
+/*
+** extend_2df2hf:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** fcvt (z[0-9]+)\.d, \1/m, \2\.h
+** ...
+*/
+v32df
+extend_2df2hf (v32hf x)
+{
+ return __builtin_convertvector (x, v32df);
+}
+
+/*
+** extend_2df2sf:
+** ptrue (p[0-7])\.b, vl256
+** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
+** fcvt (z[0-9]+)\.d, \1/m, \2\.s
+** ...
+*/
+v32df
+extend_2df2sf (v32sf x)
+{
+ return __builtin_convertvector (x, v32df);
+}
+
+/*
+** extend_4sf4hf:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** fcvt (z[0-9]+)\.s, \1/m, \2\.h
+** ...
+*/
+v64sf
+extend_4sf4hf (v64hf x)
+{
+ return __builtin_convertvector (x, v64sf);
+}
+
+/*
+** extend_2sf2hf:
+** ...
+** ld1h (z[0-9]+)\.d, p[0-7]/z, \[x0\]
+** ptrue (p[0-7])\.d, vl32
+** fcvt (z[0-9]+)\.s, \2/m, \1\.h
+** ...
+*/
+v32sf
+extend_2sf2hf (v32hf x)
+{
+ return __builtin_convertvector (x, v32sf);
+}
+
+/* { dg-final { check-function-bodies "**" "" ""} } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -msve-vector-bits=2048 -fno-trapping-math" } */
+
+#include "unpacked_fcvt_1.c"
+
+/* { dg-final { scan-assembler-not {\tptrue\tp[0-7]\.d} } } */
+/* { dg-final { scan-assembler-not {\tptrue\tp[0-7]\.s} } } */
+/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b} 8 } } */
+
+/* { dg-final { scan-assembler-times {\tfcvt\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfcvt\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfcvt\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.s\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tfcvt\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfcvt\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfcvt\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.h\n} 2 } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -msve-vector-bits=2048 -fno-schedule-insns -fno-schedule-insns2" } */
+
+#include <stdint.h>
+
+typedef _Float16 v32hf __attribute__((vector_size(64)));
+typedef _Float16 v64hf __attribute__((vector_size(128)));
+
+typedef float v32sf __attribute__((vector_size(128)));
+typedef float v64sf __attribute__((vector_size(256)));
+
+typedef double v32df __attribute__((vector_size(256)));
+
+typedef int16_t v32hi __attribute__((vector_size(64)));
+typedef int16_t v64hi __attribute__((vector_size(128)));
+typedef uint16_t v32uhi __attribute__((vector_size(64)));
+typedef uint16_t v64uhi __attribute__((vector_size(128)));
+
+typedef int32_t v32si __attribute__((vector_size(128)));
+typedef int32_t v64si __attribute__((vector_size(256)));
+typedef uint32_t v32usi __attribute__((vector_size(128)));
+typedef uint32_t v64usi __attribute__((vector_size(256)));
+
+typedef int64_t v32di __attribute__((vector_size(256)));
+typedef uint64_t v32udi __attribute__((vector_size(256)));
+
+
+/*
+** fix_trunc_2hi2hf:
+** ...
+** ld1h (z[0-9]+)\.d, p[0-7]/z, \[x0\]
+** ptrue (p[0-7])\.d, vl32
+** fcvtzs (z[0-9]+)\.h, \2/m, \1\.h
+** ...
+*/
+v32hi
+fix_trunc_2hi2hf (v32hf x)
+{
+ return __builtin_convertvector (x, v32hi);
+}
+
+/*
+** fix_trunc_2uhi2hf:
+** ...
+** ld1h (z[0-9]+)\.d, p[0-7]/z, \[x0\]
+** ptrue (p[0-7])\.d, vl32
+** fcvtzu (z[0-9]+)\.h, \2/m, \1\.h
+** ...
+*/
+v32uhi
+fix_trunc_2uhi2hf (v32hf x)
+{
+ return __builtin_convertvector (x, v32uhi);
+}
+
+/*
+** fix_trunc_2si2hf:
+** ...
+** ld1h (z[0-9]+)\.d, p[0-7]/z, \[x0\]
+** ptrue (p[0-7])\.d, vl32
+** fcvtzs (z[0-9]+)\.s, \2/m, \1\.h
+** ...
+*/
+v32si
+fix_trunc_2si2hf (v32hf x)
+{
+ return __builtin_convertvector (x, v32si);
+}
+
+/*
+** fix_trunc_2usi2hf:
+** ...
+** ld1h (z[0-9]+)\.d, p[0-7]/z, \[x0\]
+** ptrue (p[0-7])\.d, vl32
+** fcvtzu (z[0-9]+)\.s, \2/m, \1\.h
+** ...
+*/
+v32usi
+fix_trunc_2usi2hf (v32hf x)
+{
+ return __builtin_convertvector (x, v32usi);
+}
+
+/*
+** fix_trunc_2di2hf:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** fcvtzs (z[0-9]+)\.d, \1/m, \2\.h
+** ...
+*/
+v32di
+fix_trunc_2di2hf (v32hf x)
+{
+ return __builtin_convertvector (x, v32di);
+}
+
+/*
+** fix_trunc_2udi2hf:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** fcvtzu (z[0-9]+)\.d, \1/m, \2\.h
+** ...
+*/
+v32udi
+fix_trunc_2udi2hf (v32hf x)
+{
+ return __builtin_convertvector (x, v32udi);
+}
+
+/*
+** fix_trunc_4hi4hf:
+** ...
+** ld1h (z[0-9]+)\.s, p[0-7]/z, \[x0\]
+** ptrue (p[0-7])\.s, vl64
+** fcvtzs (z[0-9]+)\.h, \2/m, \1\.h
+** ...
+*/
+v64hi
+fix_trunc_4hi4hf (v64hf x)
+{
+ return __builtin_convertvector (x, v64hi);
+}
+
+/*
+** fix_trunc_4uhi4hf:
+** ...
+** ld1h (z[0-9]+)\.s, p[0-7]/z, \[x0\]
+** ptrue (p[0-7])\.s, vl64
+** fcvtzu (z[0-9]+)\.h, \2/m, \1\.h
+** ...
+*/
+v64uhi
+fix_trunc_4uhi4hf (v64hf x)
+{
+ return __builtin_convertvector (x, v64uhi);
+}
+
+/*
+** fix_trunc_4si4hf:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** fcvtzs (z[0-9]+)\.s, \1/m, \2\.h
+** ...
+*/
+v64si
+fix_trunc_4si4hf (v64hf x)
+{
+ return __builtin_convertvector (x, v64si);
+}
+
+/*
+** fix_trunc_4usi4hf:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** fcvtzu (z[0-9]+)\.s, \1/m, \2\.h
+** ...
+*/
+v64usi
+fix_trunc_4usi4hf (v64hf x)
+{
+ return __builtin_convertvector (x, v64usi);
+}
+
+/*
+** fix_trunc_2si2sf:
+** ...
+** ld1w (z[0-9]+)\.d, p[0-7]/z, \[x0\]
+** ptrue (p[0-7])\.d, vl32
+** fcvtzs (z[0-9]+)\.s, \2/m, \1\.s
+** ...
+*/
+v32si
+fix_trunc_2si2sf (v32sf x)
+{
+ return __builtin_convertvector (x, v32si);
+}
+
+/*
+** fix_trunc_2usi2sf:
+** ...
+** ld1w (z[0-9]+)\.d, p[0-7]/z, \[x0\]
+** ptrue (p[0-7])\.d, vl32
+** fcvtzu (z[0-9]+)\.s, \2/m, \1\.s
+** ...
+*/
+v32usi
+fix_trunc_2usi2sf (v32sf x)
+{
+ return __builtin_convertvector (x, v32usi);
+}
+
+/*
+** fix_trunc_2di2sf:
+** ptrue (p[0-7])\.b, vl256
+** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
+** fcvtzs (z[0-9]+)\.d, \1/m, \2\.s
+** ...
+*/
+v32di
+fix_trunc_2di2sf (v32sf x)
+{
+ return __builtin_convertvector (x, v32di);
+}
+
+/*
+** fix_trunc_2udi2sf:
+** ptrue (p[0-7])\.b, vl256
+** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
+** fcvtzu (z[0-9]+)\.d, \1/m, \2\.s
+** ...
+*/
+v32udi
+fix_trunc_2udi2sf (v32sf x)
+{
+ return __builtin_convertvector (x, v32udi);
+}
+
+/*
+** fix_trunc_2si2df:
+** ptrue (p[0-7])\.b, vl256
+** ld1d (z[0-9]+)\.d, \1/z, \[x0\]
+** fcvtzs (z[0-9]+)\.s, \1/m, \2\.d
+** ...
+*/
+v32si
+fix_trunc_2si2df (v32df x)
+{
+ return __builtin_convertvector (x, v32si);
+}
+
+/*
+** fix_trunc_2usi2df:
+** ptrue (p[0-7])\.b, vl256
+** ld1d (z[0-9]+)\.d, \1/z, \[x0\]
+** fcvtzu (z[0-9]+)\.s, \1/m, \2\.d
+** ...
+*/
+v32usi
+fix_trunc_2usi2df (v32df x)
+{
+ return __builtin_convertvector (x, v32usi);
+}
+
+/* { dg-final { check-function-bodies "**" "" ""} } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -msve-vector-bits=2048 -fno-trapping-math" } */
+
+#include "unpacked_fcvtz_1.c"
+
+/* { dg-final { scan-assembler-not {\tptrue\tp[0-7]\.d} } } */
+/* { dg-final { scan-assembler-not {\tptrue\tp[0-7]\.s} } } */
+/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b} 16 } } */
+
+/* { dg-final { scan-assembler-times {\tfcvtzs\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tfcvtzu\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tfcvtzs\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tfcvtzu\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.h\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tfcvtzs\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfcvtzu\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.h\n} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfcvtzs\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfcvtzu\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s\n} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfcvtzs\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfcvtzu\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.s\n} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfcvtzs\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfcvtzu\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.d\n} 1 } } */