void aarch64_expand_mov_immediate (rtx, rtx);
rtx aarch64_stack_protect_canary_mem (machine_mode, rtx, aarch64_salt_type);
rtx aarch64_ptrue_reg (machine_mode);
+rtx aarch64_ptrue_reg (machine_mode, unsigned int);
rtx aarch64_pfalse_reg (machine_mode);
bool aarch64_sve_same_pred_for_ptest_p (rtx *, rtx *);
void aarch64_emit_sve_pred_move (rtx, rtx, rtx);
)
(define_expand "popcount<mode>2"
- [(set (match_operand:VDQHSD 0 "register_operand")
- (popcount:VDQHSD (match_operand:VDQHSD 1 "register_operand")))]
+ [(set (match_operand:VDQHSD_V1DI 0 "register_operand")
+ (popcount:VDQHSD_V1DI
+ (match_operand:VDQHSD_V1DI 1 "register_operand")))]
"TARGET_SIMD"
{
if (TARGET_SVE)
{
- rtx p = aarch64_ptrue_reg (<VPRED>mode);
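+ /* Restrict the predicate to the 8 or 16 bytes that the Advanced SIMD
+    vector actually occupies.  */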
+ rtx p = aarch64_ptrue_reg (<VPRED>mode, <bitsize> == 64 ? 8 : 16);
emit_insn (gen_aarch64_pred_popcount<mode> (operands[0],
p,
operands[1]));
DONE;
}
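+ /* There is no Advanced SIMD popcount for a single 64-bit element;
+    expand through the scalar popcountdi2 pattern on a DImode view of
+    the operand.  */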
+ if (<MODE>mode == V1DImode)
+ {
+ rtx out = gen_reg_rtx (DImode);
+ emit_insn (gen_popcountdi2 (out, gen_lowpart (DImode, operands[1])));
+ emit_move_insn (operands[0], gen_lowpart (<MODE>mode, out));
+ DONE;
+ }
+
/* Generate a byte popcount. */
machine_mode mode = <bitsize> == 64 ? V8QImode : V16QImode;
machine_mode mode2 = <bitsize> == 64 ? V2SImode : V4SImode;
return gen_lowpart (mode, reg);
}
+/* Return a predicate register of mode MODE in which the leading VL
+   bits are set and all other bits are clear.  */
+
+rtx
+aarch64_ptrue_reg (machine_mode mode, unsigned int vl)
+{
+ gcc_assert (aarch64_sve_pred_mode_p (mode));
+
+ /* Build VL ones followed by VL zeros; with two elements per pattern,
+    the trailing zeros repeat to fill the rest of the vector.  */
+ rtx_vector_builder builder (VNx16BImode, vl, 2);
+
+ for (unsigned int i = 0; i < vl; i++)
+ builder.quick_push (CONST1_RTX (BImode));
+
+ for (unsigned int i = 0; i < vl; i++)
+ builder.quick_push (CONST0_RTX (BImode));
+
+ rtx const_vec = builder.build ();
+ rtx reg = force_reg (VNx16BImode, const_vec);
+ return gen_lowpart (mode, reg);
+}
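+
+/* For example, aarch64_ptrue_reg (VNx4BImode, 8) builds the VNx16BImode
+   constant { 1, 1, 1, 1, 1, 1, 1, 1, 0, ... } and returns it in
+   VNx4BImode; the SVE predicate move patterns emit it as
+   "ptrue pN.b, vl8".  */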
+
/* Return an all-false predicate register of mode MODE. */
rtx
(popcount:ALLI (match_operand:ALLI 1 "register_operand")))]
"TARGET_CSSC ? GET_MODE_BITSIZE (<MODE>mode) >= 32 : TARGET_SIMD"
{
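+ /* With SVE but without CSSC, expand through the corresponding 64-bit
+    vector mode: one predicated SVE CNT replaces the byte popcount and
+    reduction sequence.  QImode keeps the Advanced SIMD path, which is
+    already a single CNT.  */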
+ if (!TARGET_CSSC && TARGET_SVE && <MODE>mode != QImode)
+ {
+ rtx tmp = gen_reg_rtx (<VEC_POP_MODE>mode);
+ rtx op1 = gen_lowpart (<VEC_POP_MODE>mode, operands[1]);
+ emit_insn (gen_popcount<vec_pop_mode>2 (tmp, op1));
+ emit_move_insn (operands[0], gen_lowpart (<MODE>mode, tmp));
+ DONE;
+ }
+
if (!TARGET_CSSC)
{
rtx v = gen_reg_rtx (V8QImode);
;; Advanced SIMD modes for H, S and D types.
(define_mode_iterator VDQHSD [V4HI V8HI V2SI V4SI V2DI])
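+
+;; VDQHSD plus the single-element vector mode V1DI.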
+(define_mode_iterator VDQHSD_V1DI [VDQHSD V1DI])
+
;; Advanced SIMD and scalar integer modes for H and S.
(define_mode_iterator VSDQ_HSI [V4HI V8HI V2SI V4SI HI SI])
(define_mode_iterator SVE_I_SIMD_DI [SVE_I V2DI])
;; All SVE and Advanced SIMD integer vector modes.
-(define_mode_iterator SVE_VDQ_I [SVE_I VDQ_I])
+(define_mode_iterator SVE_VDQ_I [SVE_I VDQ_I V1DI])
;; SVE integer vector modes whose elements are 16 bits or wider.
(define_mode_iterator SVE_HSDI [VNx8HI VNx4HI VNx2HI
(define_mode_attr bitsize [(V8QI "64") (V16QI "128")
(V4HI "64") (V8HI "128")
(V2SI "64") (V4SI "128")
- (V2DI "128")])
+ (V1DI "64") (V2DI "128")])
;; Map a floating point or integer mode to the appropriate register name prefix
(define_mode_attr s [(HF "h") (SF "s") (DF "d") (SI "s") (DI "d")])
(VNx8DI "VNx2BI") (VNx8DF "VNx2BI")
(V8QI "VNx8BI") (V16QI "VNx16BI")
(V4HI "VNx4BI") (V8HI "VNx8BI") (V2SI "VNx2BI")
- (V4SI "VNx4BI") (V2DI "VNx2BI")])
+ (V4SI "VNx4BI") (V2DI "VNx2BI") (V1DI "VNx2BI")])
;; ...and again in lower case.
(define_mode_attr vpred [(VNx16QI "vnx16bi") (VNx8QI "vnx8bi")
(VNx4SI "VNx8SI") (VNx4SF "VNx8SF")
(VNx2DI "VNx4DI") (VNx2DF "VNx4DF")])
+;; The Advanced SIMD vector mode used when expanding a popcount of the
+;; corresponding scalar mode.
+(define_mode_attr VEC_POP_MODE [(QI "V8QI") (HI "V4HI")
+ (SI "V2SI") (DI "V1DI")])
+
+;; ...and again in lower case.
+(define_mode_attr vec_pop_mode [(QI "v8qi") (HI "v4hi")
+ (SI "v2si") (DI "v1di")])
+
;; On AArch64 the By element instruction doesn't have a 2S variant.
;; However, because the instruction always selects a pair of values,
;; the normal 3SAME instruction can be used here instead.
/*
** f_v4hi:
-** ptrue (p[0-7]).b, all
+** ptrue (p[0-7]).b, vl8
** ldr d([0-9]+), \[x0\]
** cnt z\2.h, \1/m, z\2.h
** str d\2, \[x1\]
/*
** f_v8hi:
-** ptrue (p[0-7]).b, all
+** ptrue (p[0-7]).b, vl16
** ldr q([0-9]+), \[x0\]
** cnt z\2.h, \1/m, z\2.h
** str q\2, \[x1\]
/*
** f_v2si:
-** ptrue (p[0-7]).b, all
+** ptrue (p[0-7]).b, vl8
** ldr d([0-9]+), \[x0\]
** cnt z\2.s, \1/m, z\2.s
** str d\2, \[x1\]
/*
** f_v4si:
-** ptrue (p[0-7]).b, all
+** ptrue (p[0-7]).b, vl16
** ldr q([0-9]+), \[x0\]
** cnt z\2.s, \1/m, z\2.s
** str q\2, \[x1\]
/*
** f_v2di:
-** ptrue (p[0-7]).b, all
+** ptrue (p[0-7]).b, vl16
** ldr q([0-9]+), \[x0\]
** cnt z\2.d, \1/m, z\2.d
** str q\2, \[x1\]
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=armv8.2-a+sve" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
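+/* With SVE enabled but CSSC disabled, scalar popcounts of 16, 32 and
+   64 bits should use a single predicated SVE CNT; the byte case keeps
+   the Advanced SIMD CNT.  */
+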
+/*
+** f_qi:
+** ldr b([0-9]+), \[x0\]
+** cnt v\1.8b, v\1.8b
+** smov w0, v\1.b\[0\]
+** ret
+*/
+unsigned
+f_qi (unsigned char *a)
+{
+ return __builtin_popcountg (a[0]);
+}
+
+/*
+** f_hi:
+** ldr h([0-9]+), \[x0\]
+** ptrue (p[0-7]).b, vl8
+** cnt z\1.h, \2/m, z\1.h
+** smov w0, v\1.h\[0\]
+** ret
+*/
+unsigned
+f_hi (unsigned short *a)
+{
+ return __builtin_popcountg (a[0]);
+}
+
+/*
+** f_si:
+** ldr s([0-9]+), \[x0\]
+** ptrue (p[0-7]).b, vl8
+** cnt z\1.s, \2/m, z\1.s
+** umov x0, v\1.d\[0\]
+** ret
+*/
+unsigned
+f_si (unsigned int *a)
+{
+ return __builtin_popcountg (a[0]);
+}
+
+/*
+** f_di:
+** ldr d([0-9]+), \[x0\]
+** ptrue (p[0-7])\.b, vl8
+** cnt z\1\.d, \2/m, z\1\.d
+** fmov x0, d\1
+** ret
+*/
+unsigned
+f_di (unsigned long *a)
+{
+ return __builtin_popcountg (a[0]);
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -fgimple" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#pragma GCC target "+nosve"
+
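+/* With SVE disabled, the V1DI popcount falls back to a byte CNT
+   followed by an ADDV reduction.  */
+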
+/*
+** foo:
+** cnt (v[0-9]+\.8b), v0\.8b
+** addv b0, \1
+** ret
+*/
+__Uint64x1_t __GIMPLE
+foo (__Uint64x1_t x)
+{
+ __Uint64x1_t z;
+
+ z = .POPCOUNT (x);
+ return z;
+}