;; ---- [INT,FP] Conditional reductions
;; ---- [INT] Tree reductions
;; ---- [FP] Tree reductions
+;; ---- [Predicate] Tree reductions
;; ---- [FP] Left-to-right reductions
;;
;; == Permutes
[(set_attr "sve_type" "sve_fp_reduc")]
)
+;; -------------------------------------------------------------------------
+;; ---- [Predicate] Tree reductions
+;; -------------------------------------------------------------------------
+;; Includes:
+;; - IORV
+;; - XORV
+;; - ANDV
+;; -------------------------------------------------------------------------
+
+;; Unpredicated predicate AND tree reductions.
+;; Invert the predicate and check across all lanes
+;; that the Zero flag is set.
+;;
+;; ptrue p3.b, all
+;; nots p3.b, p3/z, p0.b
+;; cset w0, none
+;;
+(define_expand "reduc_sbool_and_scal_<mode>"
+ [(set (match_operand:QI 0 "register_operand")
+ (unspec:QI [(match_operand:PRED_ALL 1 "register_operand")]
+ UNSPEC_ANDV))]
+ "TARGET_SVE"
+ {
+ rtx ptrue = force_reg (VNx16BImode, aarch64_ptrue_all (<data_bytes>));
+ rtx cast_ptrue = gen_lowpart (<MODE>mode, ptrue);
+ rtx tmp = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_aarch64_pred_one_cmpl_z (<MODE>mode, tmp, cast_ptrue,
+ operands[1]));
+ emit_insn (
+ gen_aarch64_ptest<mode> (ptrue, cast_ptrue,
+ gen_int_mode (SVE_KNOWN_PTRUE, SImode),
+ tmp));
+ rtx cc_reg = gen_rtx_REG (CC_NZCmode, CC_REGNUM);
+ rtx cmp = gen_rtx_fmt_ee (EQ, SImode, cc_reg, const0_rtx);
+ rtx tmp2 = gen_reg_rtx (SImode);
+ emit_insn (gen_aarch64_cstoresi (tmp2, cmp, cc_reg));
+ emit_move_insn (operands[0], gen_lowpart (QImode, tmp2));
+ DONE;
+ }
+)
+
+;; Unpredicated predicate IOR tree reductions.
+;; We need to make sure the results are in the CC flags, so execute a ptest
+;; on the same predicate.
+;;
+;; ptest p0, p0.b
+;; cset w0, any
+;;
+(define_expand "reduc_sbool_ior_scal_<mode>"
+ [(set (match_operand:QI 0 "register_operand")
+ (unspec:QI [(match_operand:PRED_ALL 1 "register_operand")]
+ UNSPEC_IORV))]
+ "TARGET_SVE"
+ {
+ rtx ptrue = lowpart_subreg (VNx16BImode, operands[1], <MODE>mode);
+ emit_insn (
+ gen_aarch64_ptest<mode> (ptrue, operands[1],
+ gen_int_mode (SVE_MAYBE_NOT_PTRUE, SImode),
+ operands[1]));
+ rtx cc_reg = gen_rtx_REG (CC_NZCmode, CC_REGNUM);
+ rtx cmp = gen_rtx_fmt_ee (NE, SImode, cc_reg, const0_rtx);
+ rtx tmp = gen_reg_rtx (SImode);
+ emit_insn (gen_aarch64_cstoresi (tmp, cmp, cc_reg));
+ emit_move_insn (operands[0], gen_lowpart (QImode, tmp));
+ DONE;
+ }
+)
+
+;; Unpredicated predicate XOR tree reductions.
+;; Check to see if the number of active lanes in the predicates is a multiple
+;; of 2. This generates:
+;;
+;; cntp x0, p0, p0.b
+;; and w0, w0, 1
+;;
+(define_expand "reduc_sbool_xor_scal_<mode>"
+ [(set (match_dup 2)
+ (zero_extend:DI
+ (unspec:SI [(match_dup 1)
+ (const_int SVE_MAYBE_NOT_PTRUE)
+ (match_operand:PRED_ALL 1 "register_operand")]
+ UNSPEC_CNTP)))
+ (set (match_dup 4)
+ (and:DI (match_dup 2)
+ (const_int 1)))
+ (set (match_operand:QI 0 "register_operand")
+ (subreg:QI (match_dup 4) 0))]
+ "TARGET_SVE"
+ {
+ operands[2] = gen_reg_rtx (DImode);
+ operands[4] = gen_reg_rtx (DImode);
+ }
+)
+
;; -------------------------------------------------------------------------
;; ---- [FP] Left-to-right reductions
;; -------------------------------------------------------------------------
--- /dev/null
+/* { dg-do run } */
+/* { dg-require-effective-target aarch64_sve_hw } */
+/* { dg-options "-O3 -march=armv8-a+sve -mautovec-preference=sve-only -fdump-tree-vect-details" }*/
+
+char p[128];
+
+bool __attribute__((noipa))
+fand (int n)
+{
+ bool r = true;
+ for (int i = 0; i < n; ++i)
+ r &= (p[i] != 0);
+ return r;
+}
+
+bool __attribute__((noipa))
+fior (int n)
+{
+ bool r = false;
+ for (int i = 0; i < n; ++i)
+ r |= (p[i] != 0);
+ return r;
+}
+
+int main()
+{
+ __builtin_memset (p, 1, sizeof(p));
+
+ for (int n = 0; n < 77; ++n)
+ if (!fand (n))
+ __builtin_abort ();
+
+ p[0] = 0;
+ for (int n = 1; n < 77; ++n)
+ if (fand (n))
+ __builtin_abort ();
+
+ __builtin_memset (p, 0, sizeof(p));
+
+ for (int n = 0; n < 77; ++n)
+ if (fior (n))
+ __builtin_abort ();
+
+ p[0] = 1;
+ for (int n = 1; n < 77; ++n)
+ if (!fior (n))
+ __builtin_abort ();
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 2 "vect" } } */
--- /dev/null
+/* { dg-do run } */
+/* { dg-require-effective-target aarch64_sve_hw } */
+/* { dg-options "-O3 -march=armv8-a+sve -mautovec-preference=sve-only -fdump-tree-vect-details" }*/
+
+short p[128];
+
+bool __attribute__((noipa))
+fand (int n)
+{
+ bool r = true;
+ for (int i = 0; i < n; ++i)
+ r &= (p[i] != 0);
+ return r;
+}
+
+bool __attribute__((noipa))
+fior (int n)
+{
+ bool r = false;
+ for (int i = 0; i < n; ++i)
+ r |= (p[i] != 0);
+ return r;
+}
+
+int main()
+{
+ __builtin_memset (p, 1, sizeof(p));
+
+ for (int n = 0; n < 77; ++n)
+ if (!fand (n))
+ __builtin_abort ();
+
+ p[0] = 0;
+ for (int n = 1; n < 77; ++n)
+ if (fand (n))
+ __builtin_abort ();
+
+ __builtin_memset (p, 0, sizeof(p));
+
+ for (int n = 0; n < 77; ++n)
+ if (fior (n))
+ __builtin_abort ();
+
+ p[0] = 1;
+ for (int n = 1; n < 77; ++n)
+ if (!fior (n))
+ __builtin_abort ();
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 2 "vect" } } */
--- /dev/null
+/* { dg-do run } */
+/* { dg-require-effective-target aarch64_sve_hw } */
+/* { dg-options "-O3 -march=armv8-a+sve -mautovec-preference=sve-only -fdump-tree-vect-details" }*/
+
+int p[128];
+
+bool __attribute__((noipa))
+fand (int n)
+{
+ bool r = true;
+ for (int i = 0; i < n; ++i)
+ r &= (p[i] != 0);
+ return r;
+}
+
+bool __attribute__((noipa))
+fior (int n)
+{
+ bool r = false;
+ for (int i = 0; i < n; ++i)
+ r |= (p[i] != 0);
+ return r;
+}
+
+int main()
+{
+ __builtin_memset (p, 1, sizeof(p));
+
+ for (int n = 0; n < 77; ++n)
+ if (!fand (n))
+ __builtin_abort ();
+
+ p[0] = 0;
+ for (int n = 1; n < 77; ++n)
+ if (fand (n))
+ __builtin_abort ();
+
+ __builtin_memset (p, 0, sizeof(p));
+
+ for (int n = 0; n < 77; ++n)
+ if (fior (n))
+ __builtin_abort ();
+
+ p[0] = 1;
+ for (int n = 1; n < 77; ++n)
+ if (!fior (n))
+ __builtin_abort ();
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 2 "vect" } } */
--- /dev/null
+/* { dg-do run } */
+/* { dg-require-effective-target aarch64_sve_hw } */
+/* { dg-options "-O3 -march=armv8-a+sve -mautovec-preference=sve-only -fdump-tree-vect-details" }*/
+
+long long p[128];
+
+bool __attribute__((noipa))
+fand (int n)
+{
+ bool r = true;
+ for (int i = 0; i < n; ++i)
+ r &= (p[i] != 0);
+ return r;
+}
+
+bool __attribute__((noipa))
+fior (int n)
+{
+ bool r = false;
+ for (int i = 0; i < n; ++i)
+ r |= (p[i] != 0);
+ return r;
+}
+
+int main()
+{
+ __builtin_memset (p, 1, sizeof(p));
+
+ for (int n = 0; n < 77; ++n)
+ if (!fand (n))
+ __builtin_abort ();
+
+ p[0] = 0;
+ for (int n = 1; n < 77; ++n)
+ if (fand (n))
+ __builtin_abort ();
+
+ __builtin_memset (p, 0, sizeof(p));
+
+ for (int n = 0; n < 77; ++n)
+ if (fior (n))
+ __builtin_abort ();
+
+ p[0] = 1;
+ for (int n = 1; n < 77; ++n)
+ if (!fior (n))
+ __builtin_abort ();
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 2 "vect" } } */
--- /dev/null
+/* { dg-do run } */
+/* { dg-require-effective-target aarch64_sve_hw } */
+/* { dg-options "-O3 -march=armv8-a+sve -mautovec-preference=sve-only -fdump-tree-vect-details" }*/
+
+char p[128];
+
+bool __attribute__((noipa))
+fxort (int n)
+{
+ bool r = true;
+ for (int i = 0; i < n; ++i)
+ r ^= (p[i] != 0);
+ return r;
+}
+
+bool __attribute__((noipa))
+fxorf (int n)
+{
+ bool r = false;
+ for (int i = 0; i < n; ++i)
+ r ^= (p[i] != 0);
+ return r;
+}
+
+int main()
+{
+ __builtin_memset (p, 1, sizeof(p));
+
+ for (int n = 0; n < 77; ++n)
+ if (fxort (n) != !(n & 1))
+ __builtin_abort ();
+
+ for (int n = 0; n < 77; ++n)
+ if (fxorf (n) != (n & 1))
+ __builtin_abort ();
+
+ __builtin_memset (p, 0, sizeof(p));
+
+ for (int n = 0; n < 77; ++n)
+ if (!fxort (n))
+ __builtin_abort ();
+
+ for (int n = 0; n < 77; ++n)
+ if (fxorf (n))
+ __builtin_abort ();
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 2 "vect" } } */
--- /dev/null
+/* { dg-do run } */
+/* { dg-require-effective-target aarch64_sve_hw } */
+/* { dg-options "-O3 -march=armv8-a+sve -mautovec-preference=sve-only -fdump-tree-vect-details" }*/
+
+short p[128];
+
+bool __attribute__((noipa))
+fxort (int n)
+{
+ bool r = true;
+ for (int i = 0; i < n; ++i)
+ r ^= (p[i] != 0);
+ return r;
+}
+
+bool __attribute__((noipa))
+fxorf (int n)
+{
+ bool r = false;
+ for (int i = 0; i < n; ++i)
+ r ^= (p[i] != 0);
+ return r;
+}
+
+int main()
+{
+ __builtin_memset (p, 1, sizeof(p));
+
+ for (int n = 0; n < 77; ++n)
+ if (fxort (n) != !(n & 1))
+ __builtin_abort ();
+
+ for (int n = 0; n < 77; ++n)
+ if (fxorf (n) != (n & 1))
+ __builtin_abort ();
+
+ __builtin_memset (p, 0, sizeof(p));
+
+ for (int n = 0; n < 77; ++n)
+ if (!fxort (n))
+ __builtin_abort ();
+
+ for (int n = 0; n < 77; ++n)
+ if (fxorf (n))
+ __builtin_abort ();
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 2 "vect" } } */
--- /dev/null
+/* { dg-do run } */
+/* { dg-require-effective-target aarch64_sve_hw } */
+/* { dg-options "-O3 -march=armv8-a+sve -mautovec-preference=sve-only -fdump-tree-vect-details" }*/
+
+int p[128];
+
+bool __attribute__((noipa))
+fxort (int n)
+{
+ bool r = true;
+ for (int i = 0; i < n; ++i)
+ r ^= (p[i] != 0);
+ return r;
+}
+
+bool __attribute__((noipa))
+fxorf (int n)
+{
+ bool r = false;
+ for (int i = 0; i < n; ++i)
+ r ^= (p[i] != 0);
+ return r;
+}
+
+int main()
+{
+ __builtin_memset (p, 1, sizeof(p));
+
+ for (int n = 0; n < 77; ++n)
+ if (fxort (n) != !(n & 1))
+ __builtin_abort ();
+
+ for (int n = 0; n < 77; ++n)
+ if (fxorf (n) != (n & 1))
+ __builtin_abort ();
+
+ __builtin_memset (p, 0, sizeof(p));
+
+ for (int n = 0; n < 77; ++n)
+ if (!fxort (n))
+ __builtin_abort ();
+
+ for (int n = 0; n < 77; ++n)
+ if (fxorf (n))
+ __builtin_abort ();
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 2 "vect" } } */
--- /dev/null
+/* { dg-do run } */
+/* { dg-require-effective-target aarch64_sve_hw } */
+/* { dg-options "-O3 -march=armv8-a+sve -mautovec-preference=sve-only -fdump-tree-vect-details" }*/
+
+long long p[128];
+
+bool __attribute__((noipa))
+fxort (int n)
+{
+ bool r = true;
+ for (int i = 0; i < n; ++i)
+ r ^= (p[i] != 0);
+ return r;
+}
+
+bool __attribute__((noipa))
+fxorf (int n)
+{
+ bool r = false;
+ for (int i = 0; i < n; ++i)
+ r ^= (p[i] != 0);
+ return r;
+}
+
+int main()
+{
+ __builtin_memset (p, 1, sizeof(p));
+
+ for (int n = 0; n < 77; ++n)
+ if (fxort (n) != !(n & 1))
+ __builtin_abort ();
+
+ for (int n = 0; n < 77; ++n)
+ if (fxorf (n) != (n & 1))
+ __builtin_abort ();
+
+ __builtin_memset (p, 0, sizeof(p));
+
+ for (int n = 0; n < 77; ++n)
+ if (!fxort (n))
+ __builtin_abort ();
+
+ for (int n = 0; n < 77; ++n)
+ if (fxorf (n))
+ __builtin_abort ();
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 2 "vect" { target { vect_int && vect_condition } } } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=armv8-a+sve -mautovec-preference=sve-only -fno-schedule-insns -fno-reorder-blocks -fno-schedule-insns2 -fdump-tree-vect-details" }*/
+/* { dg-final { check-function-bodies "**" "" } } */
+
+char p[128];
+
+/*
+** fand:
+** ...
+** ptrue p[0-9]+.s, all
+** nots p[0-9]+.b, p[0-9]+/z, p[0-9]+.b
+** cset w[0-9]+, none
+** and w[0-9]+, w[0-9]+, w[0-9]+
+** ...
+*/
+bool __attribute__((noipa))
+fand (int n)
+{
+ bool r = true;
+ for (int i = 0; i < n; ++i)
+ r &= (p[i] != 0);
+ return r;
+}
+
+/*
+** fior:
+** ...
+** ptest p[0-9]+, p[0-9]+.b
+** cset w[0-9]+, any
+** orr w[0-9]+, w[0-9]+, w[0-9]+
+** ...
+*/
+bool __attribute__((noipa))
+fior (int n)
+{
+ bool r = false;
+ for (int i = 0; i < n; ++i)
+ r |= (p[i] != 0);
+ return r;
+}
+
+/*
+** fxor:
+** ...
+** cntp x[0-9]+, p[0-9]+, p[0-9]+.h
+** and w[0-9]+, w[0-9]+, 1
+** eor w[0-9]+, w[0-9]+, w[0-9]+
+** ...
+*/
+bool __attribute__((noipa))
+fxor (int n)
+{
+ bool r = false;
+ for (int i = 0; i < n; ++i)
+ r ^= (p[i] != 0);
+ return r;
+}
+
+/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 3 "vect" } } */