(match_operand:TI 1 "register_operand")]
"TARGET_SIMD && !TARGET_CSSC"
{
+ /* For SVE we can do popcount on DImode chunks of the TImode argument
+ and then use a cheap ADDP reduction. The SVE CNT instruction requires
+ materializing a PTRUE so don't do this if optimizing for size. */
+ if (TARGET_SVE && !optimize_function_for_size_p (cfun))
+ {
+ rtx v = gen_reg_rtx (V2DImode);
+ rtx v1 = gen_reg_rtx (V2DImode);
+ emit_move_insn (v, gen_lowpart (V2DImode, operands[1]));
+ rtx p = aarch64_ptrue_reg (VNx2BImode, 16);
+ emit_insn (gen_aarch64_pred_popcountv2di (v1, p, v));
+ emit_insn (gen_reduc_plus_scal_v2di (operands[0], v1));
+ DONE;
+ }
rtx v = gen_reg_rtx (V16QImode);
rtx v1 = gen_reg_rtx (V16QImode);
emit_move_insn (v, gen_lowpart (V16QImode, operands[1]));
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#pragma GCC target "+nocssc+sve"
+
+/*
+** h128:
+** ldr q([0-9]+), \[x0\]
+** ptrue p([0-9]+).b, vl16
+** cnt z([0-9]+).d, p\2/m, z\1.d
+** addp d([0-9]+), v\3.2d
+** fmov x0, d\4
+** ret
+*/
+
+/* Return the population count of the 128-bit value that A points to.
+   The operand is read through a pointer, which is why the expected body
+   for h128 starts with a Q-register load from x0.  */
+unsigned h128 (const unsigned __int128 *a) {
+ return __builtin_popcountg (a[0]);
+}
+
+/* The optimized tree dump should contain exactly one POPCOUNT and no
+   leftover __builtin_popcount calls.  */
+/* { dg-final { scan-tree-dump-times "POPCOUNT " 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-not " __builtin_popcount" "optimized" } } */
+