;;
;; [vhaddq_s, vhaddq_u])
;;
-(define_insn "mve_vhaddq_<supf><mode>"
+(define_insn "@mve_vhaddq_<supf><mode>"
[
(set (match_operand:MVE_2 0 "s_register_operand" "=w")
(unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "w")
;;
;; [vrhaddq_s, vrhaddq_u])
;;
-(define_insn "mve_vrhaddq_<supf><mode>"
+(define_insn "@mve_vrhaddq_<supf><mode>"
[
(set (match_operand:MVE_2 0 "s_register_operand" "=w")
(unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "w")
; vhadd and vrhadd.
-(define_insn "neon_v<r>hadd<sup><mode>"
+(define_insn "@neon_v<r>hadd<sup><mode>"
[(set (match_operand:VDQIW 0 "s_register_operand" "=w")
(unspec:VDQIW [(match_operand:VDQIW 1 "s_register_operand" "w")
(match_operand:VDQIW 2 "s_register_operand" "w")]
DONE;
})
+
+(define_expand "avg<mode>3_floor"
+ [(match_operand:MVE_2 0 "s_register_operand")
+ (match_operand:MVE_2 1 "s_register_operand")
+ (match_operand:MVE_2 2 "s_register_operand")]
+ "ARM_HAVE_<MODE>_ARITH"
+{
+ if (TARGET_HAVE_MVE)
+ emit_insn (gen_mve_vhaddq (VHADDQ_S, <MODE>mode,
+ operands[0], operands[1], operands[2]));
+ else
+ emit_insn (gen_neon_vhadd (UNSPEC_VHADD_S, UNSPEC_VHADD_S, <MODE>mode,
+ operands[0], operands[1], operands[2]));
+ DONE;
+})
+
+(define_expand "uavg<mode>3_floor"
+ [(match_operand:MVE_2 0 "s_register_operand")
+ (match_operand:MVE_2 1 "s_register_operand")
+ (match_operand:MVE_2 2 "s_register_operand")]
+ "ARM_HAVE_<MODE>_ARITH"
+{
+ if (TARGET_HAVE_MVE)
+ emit_insn (gen_mve_vhaddq (VHADDQ_U, <MODE>mode,
+ operands[0], operands[1], operands[2]));
+ else
+ emit_insn (gen_neon_vhadd (UNSPEC_VHADD_U, UNSPEC_VHADD_U, <MODE>mode,
+ operands[0], operands[1], operands[2]));
+ DONE;
+})
+
+(define_expand "avg<mode>3_ceil"
+ [(match_operand:MVE_2 0 "s_register_operand")
+ (match_operand:MVE_2 1 "s_register_operand")
+ (match_operand:MVE_2 2 "s_register_operand")]
+ "ARM_HAVE_<MODE>_ARITH"
+{
+ if (TARGET_HAVE_MVE)
+ emit_insn (gen_mve_vrhaddq (VRHADDQ_S, <MODE>mode,
+ operands[0], operands[1], operands[2]));
+ else
+ emit_insn (gen_neon_vhadd (UNSPEC_VRHADD_S, UNSPEC_VRHADD_S, <MODE>mode,
+ operands[0], operands[1], operands[2]));
+ DONE;
+})
+
+(define_expand "uavg<mode>3_ceil"
+ [(match_operand:MVE_2 0 "s_register_operand")
+ (match_operand:MVE_2 1 "s_register_operand")
+ (match_operand:MVE_2 2 "s_register_operand")]
+ "ARM_HAVE_<MODE>_ARITH"
+{
+ if (TARGET_HAVE_MVE)
+ emit_insn (gen_mve_vrhaddq (VRHADDQ_U, <MODE>mode,
+ operands[0], operands[1], operands[2]));
+ else
+ emit_insn (gen_neon_vhadd (UNSPEC_VRHADD_U, UNSPEC_VRHADD_U, <MODE>mode,
+ operands[0], operands[1], operands[2]));
+ DONE;
+})
--- /dev/null
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-add-options arm_v8_1m_mve } */
+/* { dg-additional-options "-O3" } */
+
+#include <stdint.h>
+
+/* We force a cast to int64_t to enable the vectorizer when dealing with 32-bit
+ inputs. */
+#define FUNC(SIGN, TYPE, BITS, OP, NAME) \
+ void test_ ## NAME ##_ ## SIGN ## BITS (TYPE##BITS##_t * __restrict__ dest, \
+ TYPE##BITS##_t *a, TYPE##BITS##_t *b) { \
+ int i; \
+ for (i=0; i < (128 / BITS); i++) { \
+ dest[i] = ((int64_t)a[i] OP b[i]) >> 1; \
+ } \
+}
+
+FUNC(s, int, 32, +, vhadd)
+FUNC(u, uint, 32, +, vhadd)
+FUNC(s, int, 16, +, vhadd)
+FUNC(u, uint, 16, +, vhadd)
+FUNC(s, int, 8, +, vhadd)
+FUNC(u, uint, 8, +, vhadd)
+
+/* { dg-final { scan-assembler-times {vhadd\.s32\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {vhadd\.u32\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {vhadd\.s16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {vhadd\.u16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {vhadd\.s8\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {vhadd\.u8\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-add-options arm_v8_1m_mve } */
+/* { dg-additional-options "-O3" } */
+
+#include <stdint.h>
+
+/* We force a cast to int64_t to enable the vectorizer when dealing with 32-bit
+ inputs. */
+#define FUNC(SIGN, TYPE, BITS, OP, NAME) \
+ void test_ ## NAME ##_ ## SIGN ## BITS (TYPE##BITS##_t * __restrict__ dest, \
+ TYPE##BITS##_t *a, TYPE##BITS##_t *b) { \
+ int i; \
+ for (i=0; i < (128 / BITS); i++) { \
+ dest[i] = ((int64_t)a[i] OP b[i] + 1) >> 1; \
+ } \
+}
+
+FUNC(s, int, 32, +, vrhadd)
+FUNC(u, uint, 32, +, vrhadd)
+FUNC(s, int, 16, +, vrhadd)
+FUNC(u, uint, 16, +, vrhadd)
+FUNC(s, int, 8, +, vrhadd)
+FUNC(u, uint, 8, +, vrhadd)
+
+/* { dg-final { scan-assembler-times {vrhadd\.s32\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {vrhadd\.u32\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {vrhadd\.s16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {vrhadd\.u16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {vrhadd\.s8\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {vrhadd\.u8\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-add-options arm_neon } */
+/* { dg-additional-options "-O3" } */
+
+#include <stdint.h>
+
+/* Since we have implemented the avg* optabs for 128-bit vectors only, use
+ enough iterations to check that vectorization works as expected. */
+
+/* We force a cast to int64_t to enable the vectorizer when dealing with 32-bit
+ inputs. */
+#define FUNC(SIGN, TYPE, BITS, OP, NAME) \
+ void test_ ## NAME ##_ ## SIGN ## BITS (TYPE##BITS##_t * __restrict__ dest, \
+ TYPE##BITS##_t *a, TYPE##BITS##_t *b) { \
+ int i; \
+ for (i=0; i < (128 / BITS); i++) { \
+ dest[i] = ((int64_t)a[i] OP b[i]) >> 1; \
+ } \
+}
+
+FUNC(s, int, 32, +, vhadd)
+FUNC(u, uint, 32, +, vhadd)
+FUNC(s, int, 16, +, vhadd)
+FUNC(u, uint, 16, +, vhadd)
+FUNC(s, int, 8, +, vhadd)
+FUNC(u, uint, 8, +, vhadd)
+
+/* { dg-final { scan-assembler-times {vhadd\.s32\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {vhadd\.u32\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {vhadd\.s16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {vhadd\.u16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {vhadd\.s8\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {vhadd\.u8\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-add-options arm_neon } */
+/* { dg-additional-options "-O3" } */
+
+#include <stdint.h>
+
+/* Since we default to -mvectorize-with-neon-quad, use enough iterations so that
+ we can vectorize using 128-bit vectors. */
+/* We force a cast to int64_t to enable the vectorizer when dealing with 32-bit
+ inputs. */
+#define FUNC(SIGN, TYPE, BITS, OP, NAME) \
+ void test_ ## NAME ##_ ## SIGN ## BITS (TYPE##BITS##_t * __restrict__ dest, \
+ TYPE##BITS##_t *a, TYPE##BITS##_t *b) { \
+ int i; \
+ for (i=0; i < (128 / BITS); i++) { \
+ dest[i] = ((int64_t)a[i] OP b[i] + 1) >> 1; \
+ } \
+}
+
+FUNC(s, int, 32, +, vrhadd)
+FUNC(u, uint, 32, +, vrhadd)
+FUNC(s, int, 16, +, vrhadd)
+FUNC(u, uint, 16, +, vrhadd)
+FUNC(s, int, 8, +, vrhadd)
+FUNC(u, uint, 8, +, vrhadd)
+
+/* { dg-final { scan-assembler-times {vrhadd\.s32\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {vrhadd\.u32\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {vrhadd\.s16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {vrhadd\.u16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {vrhadd\.s8\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {vrhadd\.u8\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */