AArch64: expand extractions of Adv.SIMD registers from SVE as separate insn.

author Tamar Christina <tamar.christina@arm.com>

Wed, 19 Nov 2025 14:27:55 +0000 (14:27 +0000)

committer Tamar Christina <tamar.christina@arm.com>

Wed, 19 Nov 2025 14:27:55 +0000 (14:27 +0000)
author Tamar Christina <tamar.christina@arm.com>
Wed, 19 Nov 2025 14:27:55 +0000 (14:27 +0000)
committer Tamar Christina <tamar.christina@arm.com>
Wed, 19 Nov 2025 14:27:55 +0000 (14:27 +0000)
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md

index 4648aa67e0c360076cf4444c4e0ac55babda34e6..26c08dbd92080565f0671e9173fef69321029083 100644 (file)
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -3112,6 +3112,48 @@
    }
  )
  
+;; Don't allow expansions of SVE to Adv. SIMD registers immediately as subregs.
+;; Doing so prevents combine from matching instructions generated by the
+;; SVE/Adv. SIMD bridge as the SVE modes are not valid inside the instructions.
+;; Eventually early-ra or reload will split them but by then we've lost the
+;; combinations.  Instead split them early and allow fwprop or combine to
+;; push them into instructions where they are actually supported as part of the
+;; instruction.
+(define_expand "vec_extract<mode><v128>"
+  [(match_operand:<V128> 0 "register_operand")
+   (match_operand:SVE_FULL 1 "register_operand")
+   (match_operand:SI 2 "const0_operand")]
+  "TARGET_SVE"
+{
+    emit_move_insn (operands[0],
+                   force_lowpart_subreg (<V128>mode, operands[1], <MODE>mode));
+    DONE;
+})
+
+;; Similarly for extractions of 64-bit Adv. SIMD vectors from SVE vectors.  For
+;; these extractions we can support offsets 0 and 1 by first extracting a
+;; 128-bit vector and then selecting the appropriate half.
+(define_expand "vec_extract<mode><v64>"
+  [(match_operand:<V64> 0 "register_operand")
+   (match_operand:SVE_FULL_BHS 1 "register_operand")
+   (match_operand:SI 2 "const0_to_1_operand")]
+  "TARGET_SVE"
+{
+    if (CONST0_RTX (SImode) == operands[2])
+      emit_move_insn (operands[0],
+                     force_lowpart_subreg (<V64>mode, operands[1],
+                                           <MODE>mode));
+    else
+      {
+       rtx tmp = gen_reg_rtx (<V128>mode);
+       emit_move_insn (tmp,
+                       force_lowpart_subreg (<V128>mode, operands[1],
+                                             <MODE>mode));
+       emit_insn (gen_vec_extract<v128><v64> (operands[0], tmp, operands[2]));
+      }
+    DONE;
+})
+
  ;; Extract element zero.  This is a special case because we want to force
  ;; the registers to be the same for the second alternative, and then
  ;; split the instruction into nothing after RA.
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md

index 32e5009a1f96a27af47d12c919211e4fb27579a1..0c80b7adeaef5e5d4ddf9fa5eea8d5bd58ea4a3b 100644 (file)
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -1820,6 +1820,18 @@
                         (VNx4SI  "v4si") (VNx4SF "v4sf")
                         (VNx2DI  "v2di") (VNx2DF "v2df")])
  
+;; Gives the mode of the 64-bit lowpart of an SVE vector.
+(define_mode_attr V64 [(VNx16QI "V8QI")
+                       (VNx8HI  "V4HI") (VNx8HF "V4HF") (VNx8BF "V4BF")
+                       (VNx4SI  "V2SI") (VNx4SF "V2SF")
+                       (VNx2DI  "DI") (VNx2DF "DF")])
+
+;; ...and again in lower case.
+(define_mode_attr v64 [(VNx16QI "v8qi")
+                       (VNx8HI  "v4hi") (VNx8HF "v4hf") (VNx8BF "v4bf")
+                       (VNx4SI  "v2si") (VNx4SF "v2sf")
+                       (VNx2DI  "di") (VNx2DF "df")])
+
  (define_mode_attr vnx [(V4SI "vnx4si") (V2DI "vnx2di")])
  
  ;; 64-bit container modes the inner or scalar source mode.
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md

index 3214476497c6ac72649123b9399e73b3ff0ccbee..f53591d4b045b88f7ed1643d6b52c84fa86c3257 100644 (file)
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -46,6 +46,10 @@
    (and (match_code "const_int")
         (match_test "op == CONST0_RTX (mode)")))
  
+(define_predicate "const0_to_1_operand"
+  (and (match_code "const_int")
+       (match_test "IN_RANGE (INTVAL (op), 0, 1)")))
+
  (define_predicate "const_0_to_7_operand"
    (and (match_code "const_int")
         (match_test "IN_RANGE (INTVAL (op), 0, 7)")))
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_6.c b/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_6.c

index 3570d4da34b5ebb4ae507d57b506f8cd87b76ba8..83ef2148fd84f806c243226f2bb4f4b6b1fdba7a 100644 (file)
--- a/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_6.c
+++ b/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_6.c
@@ -1,6 +1,7 @@
  /* { dg-do compile } */
  /* { dg-require-effective-target aarch64_little_endian } */
  /* { dg-options "-O2 -march=armv8-a+sve" } */
+/* { dg-final { check-function-bodies "**" "" } } */
  
  #include <arm_neon_sve_bridge.h>
  
@@ -16,6 +17,11 @@ test_addressable ()
    return vmovl_s8 (vget_high_s8 (z));
  }
  
+/*
+** test_scalable_type:
+**     sxtl2   v0.8h, v0.16b
+**     ret
+*/
  int16x8_t
  test_scalable_type (svint8_t scalable)
  {
@@ -34,4 +40,5 @@ test_256b_type (int16x16_t foo)
    return vmovl_s16 ((int16x4_t) { foo[4], foo[5], foo[6], foo[7] });
  }
  
-/* { dg-final { scan-assembler-not {sxtl2\t} } } */
+/* { dg-final { scan-assembler-times {sxtl2\t} 1 } } */
+/* { dg-final { scan-assembler-times {sxtl\t} 3 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fold_to_highpart_1.c b/gcc/testsuite/gcc.target/aarch64/sve/fold_to_highpart_1.c

new file mode 100644 (file)

index 0000000..a3d59a4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/fold_to_highpart_1.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O1" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <arm_neon.h>
+#include <arm_neon_sve_bridge.h>
+#include <stdint.h>
+
+/*
+** sub_neon_i16_sve_bridged:
+**     ssubl2  v0.8h, v0.16b, v1.16b
+**     ret
+*/
+svint16_t sub_neon_i16_sve_bridged(svint8_t a, svint8_t b) {
+    return svset_neonq_s16(svundef_s16(),
+            vsubq_s16(vmovl_high_s8(svget_neonq(a)),
+                      vmovl_high_s8(svget_neonq(b))));
+}
+
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fold_to_highpart_2.c b/gcc/testsuite/gcc.target/aarch64/sve/fold_to_highpart_2.c

new file mode 100644 (file)

index 0000000..6cca4ad
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/fold_to_highpart_2.c
@@ -0,0 +1,295 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O1" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <arm_neon.h>
+#include <arm_sve.h>
+#include <arm_neon_sve_bridge.h>
+
+// ============================================================================
+// 8 -> 16 : SIGNED
+// ============================================================================
+
+/* 
+** add_neon_i16_from_i8_low_sve_bridged:
+**     saddl   v0.8h, v0.8b, v1.8b
+**     ret
+*/
+svint16_t add_neon_i16_from_i8_low_sve_bridged(svint8_t a, svint8_t b) {
+    int16x8_t ar = vmovl_s8(vget_low_s8(svget_neonq(a)));
+    int16x8_t br = vmovl_s8(vget_low_s8(svget_neonq(b)));
+    return svset_neonq_s16(svundef_s16(), vaddq_s16(ar, br));
+}
+
+/*
+** add_neon_i16_from_i8_high_sve_bridged:
+**     saddl2  v0.8h, v0.16b, v1.16b
+**     ret
+*/
+svint16_t add_neon_i16_from_i8_high_sve_bridged(svint8_t a, svint8_t b) {
+    int16x8_t ar = vmovl_s8(vget_high_s8(svget_neonq(a)));
+    int16x8_t br = vmovl_s8(vget_high_s8(svget_neonq(b)));
+    return svset_neonq_s16(svundef_s16(), vaddq_s16(ar, br));
+}
+
+/*
+** sub_neon_i16_from_i8_low_sve_bridged:
+**     ssubl   v0.8h, v0.8b, v1.8b
+**     ret
+*/
+svint16_t sub_neon_i16_from_i8_low_sve_bridged(svint8_t a, svint8_t b) {
+    int16x8_t ar = vmovl_s8(vget_low_s8(svget_neonq(a)));
+    int16x8_t br = vmovl_s8(vget_low_s8(svget_neonq(b)));
+    return svset_neonq_s16(svundef_s16(), vsubq_s16(ar, br));
+}
+
+/*
+** sub_neon_i16_from_i8_high_sve_bridged:
+**     ssubl2  v0.8h, v0.16b, v1.16b
+**     ret
+*/
+svint16_t sub_neon_i16_from_i8_high_sve_bridged(svint8_t a, svint8_t b) {
+    int16x8_t ar = vmovl_s8(vget_high_s8(svget_neonq(a)));
+    int16x8_t br = vmovl_s8(vget_high_s8(svget_neonq(b)));
+    return svset_neonq_s16(svundef_s16(), vsubq_s16(ar, br));
+}
+
+// ============================================================================
+// 8 -> 16 : UNSIGNED
+// ============================================================================
+
+/*
+** add_neon_u16_from_u8_low_sve_bridged:
+**     uaddl   v0.8h, v0.8b, v1.8b
+**     ret
+*/
+svuint16_t add_neon_u16_from_u8_low_sve_bridged(svuint8_t a, svuint8_t b) {
+    uint16x8_t ar = vmovl_u8(vget_low_u8(svget_neonq(a)));
+    uint16x8_t br = vmovl_u8(vget_low_u8(svget_neonq(b)));
+    return svset_neonq_u16(svundef_u16(), vaddq_u16(ar, br));
+}
+
+/*
+** add_neon_u16_from_u8_high_sve_bridged:
+**     uaddl2  v0.8h, v0.16b, v1.16b
+**     ret
+*/
+svuint16_t add_neon_u16_from_u8_high_sve_bridged(svuint8_t a, svuint8_t b) {
+    uint16x8_t ar = vmovl_u8(vget_high_u8(svget_neonq(a)));
+    uint16x8_t br = vmovl_u8(vget_high_u8(svget_neonq(b)));
+    return svset_neonq_u16(svundef_u16(), vaddq_u16(ar, br));
+}
+
+/*
+** sub_neon_u16_from_u8_low_sve_bridged:
+**     usubl   v0.8h, v0.8b, v1.8b
+**     ret
+*/
+svuint16_t sub_neon_u16_from_u8_low_sve_bridged(svuint8_t a, svuint8_t b) {
+    uint16x8_t ar = vmovl_u8(vget_low_u8(svget_neonq(a)));
+    uint16x8_t br = vmovl_u8(vget_low_u8(svget_neonq(b)));
+    return svset_neonq_u16(svundef_u16(), vsubq_u16(ar, br));
+}
+
+/*
+** sub_neon_u16_from_u8_high_sve_bridged:
+**     usubl2  v0.8h, v0.16b, v1.16b
+**     ret
+*/
+svuint16_t sub_neon_u16_from_u8_high_sve_bridged(svuint8_t a, svuint8_t b) {
+    uint16x8_t ar = vmovl_u8(vget_high_u8(svget_neonq(a)));
+    uint16x8_t br = vmovl_u8(vget_high_u8(svget_neonq(b)));
+    return svset_neonq_u16(svundef_u16(), vsubq_u16(ar, br));
+}
+
+// ============================================================================
+// 16 -> 32 : SIGNED
+// ============================================================================
+
+/*
+** add_neon_i32_from_i16_low_sve_bridged:
+**     saddl   v0.4s, v0.4h, v1.4h
+**     ret
+*/
+svint32_t add_neon_i32_from_i16_low_sve_bridged(svint16_t a, svint16_t b) {
+    int32x4_t ar = vmovl_s16(vget_low_s16(svget_neonq(a)));
+    int32x4_t br = vmovl_s16(vget_low_s16(svget_neonq(b)));
+    return svset_neonq_s32(svundef_s32(), vaddq_s32(ar, br));
+}
+
+/*
+** add_neon_i32_from_i16_high_sve_bridged:
+**     saddl2  v0.4s, v0.8h, v1.8h
+**     ret
+*/
+svint32_t add_neon_i32_from_i16_high_sve_bridged(svint16_t a, svint16_t b) {
+    int32x4_t ar = vmovl_s16(vget_high_s16(svget_neonq(a)));
+    int32x4_t br = vmovl_s16(vget_high_s16(svget_neonq(b)));
+    return svset_neonq_s32(svundef_s32(), vaddq_s32(ar, br));
+}
+
+/*
+** sub_neon_i32_from_i16_low_sve_bridged:
+**     ssubl   v0.4s, v0.4h, v1.4h
+**     ret
+*/
+svint32_t sub_neon_i32_from_i16_low_sve_bridged(svint16_t a, svint16_t b) {
+    int32x4_t ar = vmovl_s16(vget_low_s16(svget_neonq(a)));
+    int32x4_t br = vmovl_s16(vget_low_s16(svget_neonq(b)));
+    return svset_neonq_s32(svundef_s32(), vsubq_s32(ar, br));
+}
+
+/*
+** sub_neon_i32_from_i16_high_sve_bridged:
+**     ssubl2  v0.4s, v0.8h, v1.8h
+**     ret
+*/
+svint32_t sub_neon_i32_from_i16_high_sve_bridged(svint16_t a, svint16_t b) {
+    int32x4_t ar = vmovl_s16(vget_high_s16(svget_neonq(a)));
+    int32x4_t br = vmovl_s16(vget_high_s16(svget_neonq(b)));
+    return svset_neonq_s32(svundef_s32(), vsubq_s32(ar, br));
+}
+
+// ============================================================================
+// 16 -> 32 : UNSIGNED
+// ============================================================================
+
+/*
+** add_neon_u32_from_u16_low_sve_bridged:
+**     uaddl   v0.4s, v0.4h, v1.4h
+**     ret
+*/
+svuint32_t add_neon_u32_from_u16_low_sve_bridged(svuint16_t a, svuint16_t b) {
+    uint32x4_t ar = vmovl_u16(vget_low_u16(svget_neonq(a)));
+    uint32x4_t br = vmovl_u16(vget_low_u16(svget_neonq(b)));
+    return svset_neonq_u32(svundef_u32(), vaddq_u32(ar, br));
+}
+
+/*
+** add_neon_u32_from_u16_high_sve_bridged:
+**     uaddl2  v0.4s, v0.8h, v1.8h
+**     ret
+*/
+svuint32_t add_neon_u32_from_u16_high_sve_bridged(svuint16_t a, svuint16_t b) {
+    uint32x4_t ar = vmovl_u16(vget_high_u16(svget_neonq(a)));
+    uint32x4_t br = vmovl_u16(vget_high_u16(svget_neonq(b)));
+    return svset_neonq_u32(svundef_u32(), vaddq_u32(ar, br));
+}
+
+/*
+** sub_neon_u32_from_u16_low_sve_bridged:
+**     usubl   v0.4s, v0.4h, v1.4h
+**     ret
+*/
+svuint32_t sub_neon_u32_from_u16_low_sve_bridged(svuint16_t a, svuint16_t b) {
+    uint32x4_t ar = vmovl_u16(vget_low_u16(svget_neonq(a)));
+    uint32x4_t br = vmovl_u16(vget_low_u16(svget_neonq(b)));
+    return svset_neonq_u32(svundef_u32(), vsubq_u32(ar, br));
+}
+
+/*
+** sub_neon_u32_from_u16_high_sve_bridged:
+**     usubl2  v0.4s, v0.8h, v1.8h
+**     ret
+*/
+svuint32_t sub_neon_u32_from_u16_high_sve_bridged(svuint16_t a, svuint16_t b) {
+    uint32x4_t ar = vmovl_u16(vget_high_u16(svget_neonq(a)));
+    uint32x4_t br = vmovl_u16(vget_high_u16(svget_neonq(b)));
+    return svset_neonq_u32(svundef_u32(), vsubq_u32(ar, br));
+}
+
+// ============================================================================
+// 32 -> 64 : SIGNED
+// ============================================================================
+
+/*
+** add_neon_i64_from_i32_low_sve_bridged:
+**     saddl   v0.2d, v0.2s, v1.2s
+**     ret
+*/
+svint64_t add_neon_i64_from_i32_low_sve_bridged(svint32_t a, svint32_t b) {
+    int64x2_t ar = vmovl_s32(vget_low_s32(svget_neonq(a)));
+    int64x2_t br = vmovl_s32(vget_low_s32(svget_neonq(b)));
+    return svset_neonq_s64(svundef_s64(), vaddq_s64(ar, br));
+}
+
+/*
+** add_neon_i64_from_i32_high_sve_bridged:
+**     saddl2  v0.2d, v0.4s, v1.4s
+**     ret
+*/
+svint64_t add_neon_i64_from_i32_high_sve_bridged(svint32_t a, svint32_t b) {
+    int64x2_t ar = vmovl_s32(vget_high_s32(svget_neonq(a)));
+    int64x2_t br = vmovl_s32(vget_high_s32(svget_neonq(b)));
+    return svset_neonq_s64(svundef_s64(), vaddq_s64(ar, br));
+}
+
+/*
+** sub_neon_i64_from_i32_low_sve_bridged:
+**     ssubl   v0.2d, v0.2s, v1.2s
+**     ret
+*/
+svint64_t sub_neon_i64_from_i32_low_sve_bridged(svint32_t a, svint32_t b) {
+    int64x2_t ar = vmovl_s32(vget_low_s32(svget_neonq(a)));
+    int64x2_t br = vmovl_s32(vget_low_s32(svget_neonq(b)));
+    return svset_neonq_s64(svundef_s64(), vsubq_s64(ar, br));
+}
+
+/*
+** sub_neon_i64_from_i32_high_sve_bridged:
+**     ssubl2  v0.2d, v0.4s, v1.4s
+**     ret
+*/
+svint64_t sub_neon_i64_from_i32_high_sve_bridged(svint32_t a, svint32_t b) {
+    int64x2_t ar = vmovl_s32(vget_high_s32(svget_neonq(a)));
+    int64x2_t br = vmovl_s32(vget_high_s32(svget_neonq(b)));
+    return svset_neonq_s64(svundef_s64(), vsubq_s64(ar, br));
+}
+
+// ============================================================================
+// 32 -> 64 : UNSIGNED
+// ============================================================================
+
+/*
+** add_neon_u64_from_u32_low_sve_bridged:
+**     uaddl   v0.2d, v0.2s, v1.2s
+**     ret
+*/
+svuint64_t add_neon_u64_from_u32_low_sve_bridged(svuint32_t a, svuint32_t b) {
+    uint64x2_t ar = vmovl_u32(vget_low_u32(svget_neonq(a)));
+    uint64x2_t br = vmovl_u32(vget_low_u32(svget_neonq(b)));
+    return svset_neonq_u64(svundef_u64(), vaddq_u64(ar, br));
+}
+
+/*
+** add_neon_u64_from_u32_high_sve_bridged:
+**     uaddl2  v0.2d, v0.4s, v1.4s
+**     ret
+*/
+svuint64_t add_neon_u64_from_u32_high_sve_bridged(svuint32_t a, svuint32_t b) {
+    uint64x2_t ar = vmovl_u32(vget_high_u32(svget_neonq(a)));
+    uint64x2_t br = vmovl_u32(vget_high_u32(svget_neonq(b)));
+    return svset_neonq_u64(svundef_u64(), vaddq_u64(ar, br));
+}
+
+/*
+** sub_neon_u64_from_u32_low_sve_bridged:
+**     usubl   v0.2d, v0.2s, v1.2s
+**     ret
+*/
+svuint64_t sub_neon_u64_from_u32_low_sve_bridged(svuint32_t a, svuint32_t b) {
+    uint64x2_t ar = vmovl_u32(vget_low_u32(svget_neonq(a)));
+    uint64x2_t br = vmovl_u32(vget_low_u32(svget_neonq(b)));
+    return svset_neonq_u64(svundef_u64(), vsubq_u64(ar, br));
+}
+
+/* 
+** sub_neon_u64_from_u32_high_sve_bridged:
+**     usubl2  v0.2d, v0.4s, v1.4s
+**     ret
+*/
+svuint64_t sub_neon_u64_from_u32_high_sve_bridged(svuint32_t a, svuint32_t b) {
+    uint64x2_t ar = vmovl_u32(vget_high_u32(svget_neonq(a)));
+    uint64x2_t br = vmovl_u32(vget_high_u32(svget_neonq(b)));
+    return svset_neonq_u64(svundef_u64(), vsubq_u64(ar, br));
+}
author	Tamar Christina <tamar.christina@arm.com>
	Wed, 19 Nov 2025 14:27:55 +0000 (14:27 +0000)
committer	Tamar Christina <tamar.christina@arm.com>
	Wed, 19 Nov 2025 14:27:55 +0000 (14:27 +0000)
gcc/config/aarch64/aarch64-sve.md		patch \| blob \| blame \| history
gcc/config/aarch64/iterators.md		patch \| blob \| blame \| history
gcc/config/aarch64/predicates.md		patch \| blob \| blame \| history
gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_6.c		patch \| blob \| blame \| history
gcc/testsuite/gcc.target/aarch64/sve/fold_to_highpart_1.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.target/aarch64/sve/fold_to_highpart_2.c	[new file with mode: 0644]	patch \| blob