From: Tamar Christina Date: Wed, 22 Oct 2025 09:52:43 +0000 (+0100) Subject: AArch64: Add support for boolean reductions for Adv. SIMD using SVE X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=678ec7143b85e445c8c8c2258d852a09a0df23cb;p=thirdparty%2Fgcc.git AArch64: Add support for boolean reductions for Adv. SIMD using SVE When doing boolean reductions for Adv. SIMD vectors and SVE is available we can use SVE instructions instead of Adv. SIMD ones to do the reduction. For instance OR-reductions are umaxp v3.4s, v3.4s, v3.4s fmov x1, d3 cmp x1, 0 cset w0, ne and with SVE we generate: ptrue p1.b, vl16 cmpne p1.b, p1/z, z3.b, #0 cset w0, any Where the ptrue is normally executed much earlier so it's not a bottleneck for the compare. For the remaining codegen see test vect-reduc-bool-18.c. gcc/ChangeLog: * config/aarch64/aarch64-simd.md (reduc_sbool_and_scal_<mode>, reduc_sbool_ior_scal_<mode>, reduc_sbool_xor_scal_<mode>): Use SVE if available. * config/aarch64/aarch64-sve.md (*cmp<cmp_op><mode>_ptest): Rename ... (@aarch64_pred_cmp<cmp_op><mode>_ptest): ... To this. (reduc_sbool_xor_scal_<mode>): Rename ... (@reduc_sbool_xor_scal_<mode>): ... To this. gcc/testsuite/ChangeLog: * gcc.target/aarch64/sve/vect-reduc-bool-10.c: New test. * gcc.target/aarch64/sve/vect-reduc-bool-11.c: New test. * gcc.target/aarch64/sve/vect-reduc-bool-12.c: New test. * gcc.target/aarch64/sve/vect-reduc-bool-13.c: New test. * gcc.target/aarch64/sve/vect-reduc-bool-14.c: New test. * gcc.target/aarch64/sve/vect-reduc-bool-15.c: New test. * gcc.target/aarch64/sve/vect-reduc-bool-16.c: New test. * gcc.target/aarch64/sve/vect-reduc-bool-17.c: New test. * gcc.target/aarch64/sve/vect-reduc-bool-18.c: New test. 
--- diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 648a42f7d0f..a121a18f9a0 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -3477,12 +3477,41 @@ ;; cmn x1, #1 ;; cset w0, eq ;; +;; or with SVE enabled +;; +;; ptrue p1.b, vl16 +;; cmpeq p0.b, p1/z, z1.b, #0 +;; cset w0, none +;; (define_expand "reduc_sbool_and_scal_<mode>" [(set (match_operand:QI 0 "register_operand") (unspec:QI [(match_operand:VALLI 1 "register_operand")] UNSPEC_ANDV))] "TARGET_SIMD" { + if (TARGET_SVE) + { + machine_mode full_mode = aarch64_full_sve_mode (<MODE>mode).require (); + rtx in = force_lowpart_subreg (full_mode, operands[1], <MODE>mode); + unsigned lanes + = exact_div (GET_MODE_BITSIZE (<MODE>mode), 8).to_constant (); + machine_mode pred_mode = aarch64_sve_pred_mode (full_mode); + rtx pred_res = gen_reg_rtx (pred_mode); + rtx gp = aarch64_ptrue_reg (VNx16BImode, lanes); + rtx cast_gp = lowpart_subreg (pred_mode, gp, VNx16BImode); + rtx gp_flag = gen_int_mode (SVE_MAYBE_NOT_PTRUE, SImode); + emit_insn ( + gen_aarch64_pred_cmp_ptest (EQ, full_mode, pred_res, gp, in, + CONST0_RTX (full_mode), cast_gp, + gp_flag, cast_gp, gp_flag)); + rtx cc_reg = gen_rtx_REG (CC_NZCmode, CC_REGNUM); + rtx cmp = gen_rtx_fmt_ee (EQ, SImode, cc_reg, const0_rtx); + rtx tmp2 = gen_reg_rtx (SImode); + emit_insn (gen_aarch64_cstoresi (tmp2, cmp, cc_reg)); + emit_move_insn (operands[0], gen_lowpart (QImode, tmp2)); + DONE; + } + rtx tmp = operands[1]; /* 128-bit vectors need to be compressed to 64-bits first. 
*/ if (known_eq (128, GET_MODE_BITSIZE (<MODE>mode))) @@ -3511,12 +3540,41 @@ ;; cmp x1, 0 ;; cset w0, ne ;; +;; or with SVE enabled +;; +;; ptrue p1.b, vl16 +;; cmpne p0.b, p1/z, z1.b, #0 +;; cset w0, any +;; (define_expand "reduc_sbool_ior_scal_<mode>" [(set (match_operand:QI 0 "register_operand") (unspec:QI [(match_operand:VALLI 1 "register_operand")] UNSPEC_IORV))] "TARGET_SIMD" { + if (TARGET_SVE) + { + machine_mode full_mode = aarch64_full_sve_mode (<MODE>mode).require (); + rtx in = force_lowpart_subreg (full_mode, operands[1], <MODE>mode); + unsigned lanes + = exact_div (GET_MODE_BITSIZE (<MODE>mode), 8).to_constant (); + machine_mode pred_mode = aarch64_sve_pred_mode (full_mode); + rtx pred_res = gen_reg_rtx (pred_mode); + rtx gp = aarch64_ptrue_reg (VNx16BImode, lanes); + rtx cast_gp = lowpart_subreg (pred_mode, gp, VNx16BImode); + rtx gp_flag = gen_int_mode (SVE_MAYBE_NOT_PTRUE, SImode); + emit_insn ( + gen_aarch64_pred_cmp_ptest (NE, full_mode, pred_res, gp, in, + CONST0_RTX (full_mode), cast_gp, + gp_flag, cast_gp, gp_flag)); + rtx cc_reg = gen_rtx_REG (CC_NZCmode, CC_REGNUM); + rtx cmp = gen_rtx_fmt_ee (NE, SImode, cc_reg, const0_rtx); + rtx tmp2 = gen_reg_rtx (SImode); + emit_insn (gen_aarch64_cstoresi (tmp2, cmp, cc_reg)); + emit_move_insn (operands[0], gen_lowpart (QImode, tmp2)); + DONE; + } + rtx tmp = operands[1]; /* 128-bit vectors need to be compressed to 64-bits first. 
*/ if (known_eq (128, GET_MODE_BITSIZE (<MODE>mode))) @@ -3547,12 +3605,37 @@ ;; fmov w1, s3 ;; and w0, w1, 1 ;; +;; or with SVE enabled +;; +;; ptrue p1.b, vl16 +;; cmpne p0.b, p1/z, z1.b, #0 +;; cntp x1, p0, p0.b +;; and w0, w1, 1 +;; (define_expand "reduc_sbool_xor_scal_<mode>" [(set (match_operand:QI 0 "register_operand") (unspec:QI [(match_operand:VALLI 1 "register_operand")] UNSPEC_XORV))] "TARGET_SIMD" { + if (TARGET_SVE) + { + machine_mode full_mode = aarch64_full_sve_mode (<MODE>mode).require (); + rtx in = force_lowpart_subreg (full_mode, operands[1], <MODE>mode); + unsigned lanes + = exact_div (GET_MODE_BITSIZE (<MODE>mode), 8).to_constant (); + machine_mode pred_mode = aarch64_sve_pred_mode (full_mode); + rtx pred_res = gen_reg_rtx (pred_mode); + rtx gp = aarch64_ptrue_reg (VNx16BImode, lanes); + rtx cast_gp = lowpart_subreg (pred_mode, gp, VNx16BImode); + rtx gp_flag = gen_int_mode (SVE_MAYBE_NOT_PTRUE, SImode); + emit_insn ( + gen_aarch64_pred_cmp (NE, full_mode, pred_res, cast_gp, gp_flag, in, + CONST0_RTX (full_mode))); + emit_insn (gen_reduc_sbool_xor_scal (pred_mode, operands[0], pred_res)); + DONE; + } + rtx tmp = gen_reg_rtx (<MODE>mode); rtx one_reg = force_reg (<MODE>mode, CONST1_RTX (<MODE>mode)); emit_move_insn (tmp, gen_rtx_AND (<MODE>mode, operands[1], one_reg)); diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md index 047c16f974a..f459f63d6bb 100644 --- a/gcc/config/aarch64/aarch64-sve.md +++ b/gcc/config/aarch64/aarch64-sve.md @@ -8745,7 +8745,7 @@ ;; Predicated integer comparisons in which only the flags result is ;; interesting. 
-(define_insn_and_rewrite "*cmp<cmp_op><mode>_ptest" +(define_insn_and_rewrite "@aarch64_pred_cmp<cmp_op><mode>_ptest" [(set (reg:CC_NZC CC_REGNUM) (unspec:CC_NZC [(match_operand:VNx16BI 1 "register_operand") @@ -9963,7 +9963,7 @@ ;; cntp x0, p0, p0.b ;; and w0, w0, 1 ;; -(define_expand "reduc_sbool_xor_scal_<mode>" +(define_expand "@reduc_sbool_xor_scal_<mode>" [(set (match_dup 2) (zero_extend:DI (unspec:SI [(match_dup 1) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-10.c b/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-10.c new file mode 100644 index 00000000000..c0ff50e914b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-10.c @@ -0,0 +1,52 @@ +/* { dg-do run } */ +/* { dg-require-effective-target aarch64_sve_hw } */ +/* { dg-options "-O3 -march=armv8-a+sve -mautovec-preference=asimd-only -fdump-tree-vect-details" }*/ + +char p[128]; + +bool __attribute__((noipa)) +fand (int n) +{ + bool r = true; + for (int i = 0; i < n; ++i) + r &= (p[i] != 0); + return r; +} + +bool __attribute__((noipa)) +fior (int n) +{ + bool r = false; + for (int i = 0; i < n; ++i) + r |= (p[i] != 0); + return r; +} + +int main() +{ + __builtin_memset (p, 1, sizeof(p)); + + for (int n = 0; n < 77; ++n) + if (!fand (n)) + __builtin_abort (); + + p[0] = 0; + for (int n = 1; n < 77; ++n) + if (fand (n)) + __builtin_abort (); + + __builtin_memset (p, 0, sizeof(p)); + + for (int n = 0; n < 77; ++n) + if (fior (n)) + __builtin_abort (); + + p[0] = 1; + for (int n = 1; n < 77; ++n) + if (!fior (n)) + __builtin_abort (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 2 "vect" } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-11.c b/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-11.c new file mode 100644 index 00000000000..3597fc4a456 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-11.c @@ -0,0 +1,52 @@ +/* { dg-do run } */ +/* { dg-require-effective-target aarch64_sve_hw } */ +/* { dg-options 
"-O3 -march=armv8-a+sve -mautovec-preference=asimd-only -fdump-tree-vect-details" }*/ + +short p[128]; + +bool __attribute__((noipa)) +fand (int n) +{ + bool r = true; + for (int i = 0; i < n; ++i) + r &= (p[i] != 0); + return r; +} + +bool __attribute__((noipa)) +fior (int n) +{ + bool r = false; + for (int i = 0; i < n; ++i) + r |= (p[i] != 0); + return r; +} + +int main() +{ + __builtin_memset (p, 1, sizeof(p)); + + for (int n = 0; n < 77; ++n) + if (!fand (n)) + __builtin_abort (); + + p[0] = 0; + for (int n = 1; n < 77; ++n) + if (fand (n)) + __builtin_abort (); + + __builtin_memset (p, 0, sizeof(p)); + + for (int n = 0; n < 77; ++n) + if (fior (n)) + __builtin_abort (); + + p[0] = 1; + for (int n = 1; n < 77; ++n) + if (!fior (n)) + __builtin_abort (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 2 "vect" } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-12.c b/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-12.c new file mode 100644 index 00000000000..b1173627403 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-12.c @@ -0,0 +1,52 @@ +/* { dg-do run } */ +/* { dg-require-effective-target aarch64_sve_hw } */ +/* { dg-options "-O3 -march=armv8-a+sve -mautovec-preference=asimd-only -fdump-tree-vect-details" }*/ + +int p[128]; + +bool __attribute__((noipa)) +fand (int n) +{ + bool r = true; + for (int i = 0; i < n; ++i) + r &= (p[i] != 0); + return r; +} + +bool __attribute__((noipa)) +fior (int n) +{ + bool r = false; + for (int i = 0; i < n; ++i) + r |= (p[i] != 0); + return r; +} + +int main() +{ + __builtin_memset (p, 1, sizeof(p)); + + for (int n = 0; n < 77; ++n) + if (!fand (n)) + __builtin_abort (); + + p[0] = 0; + for (int n = 1; n < 77; ++n) + if (fand (n)) + __builtin_abort (); + + __builtin_memset (p, 0, sizeof(p)); + + for (int n = 0; n < 77; ++n) + if (fior (n)) + __builtin_abort (); + + p[0] = 1; + for (int n = 1; n < 77; ++n) + if (!fior (n)) + 
__builtin_abort (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 2 "vect" } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-13.c b/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-13.c new file mode 100644 index 00000000000..a2b8a712043 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-13.c @@ -0,0 +1,52 @@ +/* { dg-do run } */ +/* { dg-require-effective-target aarch64_sve_hw } */ +/* { dg-options "-O3 -march=armv8-a+sve -mautovec-preference=asimd-only -fdump-tree-vect-details" }*/ + +long long p[128]; + +bool __attribute__((noipa)) +fand (int n) +{ + bool r = true; + for (int i = 0; i < n; ++i) + r &= (p[i] != 0); + return r; +} + +bool __attribute__((noipa)) +fior (int n) +{ + bool r = false; + for (int i = 0; i < n; ++i) + r |= (p[i] != 0); + return r; +} + +int main() +{ + __builtin_memset (p, 1, sizeof(p)); + + for (int n = 0; n < 77; ++n) + if (!fand (n)) + __builtin_abort (); + + p[0] = 0; + for (int n = 1; n < 77; ++n) + if (fand (n)) + __builtin_abort (); + + __builtin_memset (p, 0, sizeof(p)); + + for (int n = 0; n < 77; ++n) + if (fior (n)) + __builtin_abort (); + + p[0] = 1; + for (int n = 1; n < 77; ++n) + if (!fior (n)) + __builtin_abort (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 2 "vect" } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-14.c b/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-14.c new file mode 100644 index 00000000000..c24e13294fe --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-14.c @@ -0,0 +1,50 @@ +/* { dg-do run } */ +/* { dg-require-effective-target aarch64_sve_hw } */ +/* { dg-options "-O3 -march=armv8-a+sve -mautovec-preference=asimd-only -fdump-tree-vect-details" }*/ + +char p[128]; + +bool __attribute__((noipa)) +fxort (int n) +{ + bool r = true; + for (int i = 0; i < n; ++i) + r ^= (p[i] != 0); + return r; +} + +bool 
__attribute__((noipa)) +fxorf (int n) +{ + bool r = false; + for (int i = 0; i < n; ++i) + r ^= (p[i] != 0); + return r; +} + +int main() +{ + __builtin_memset (p, 1, sizeof(p)); + + for (int n = 0; n < 77; ++n) + if (fxort (n) != !(n & 1)) + __builtin_abort (); + + for (int n = 0; n < 77; ++n) + if (fxorf (n) != (n & 1)) + __builtin_abort (); + + __builtin_memset (p, 0, sizeof(p)); + + for (int n = 0; n < 77; ++n) + if (!fxort (n)) + __builtin_abort (); + + for (int n = 0; n < 77; ++n) + if (fxorf (n)) + __builtin_abort (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 2 "vect" } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-15.c b/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-15.c new file mode 100644 index 00000000000..0233b8ae233 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-15.c @@ -0,0 +1,50 @@ +/* { dg-do run } */ +/* { dg-require-effective-target aarch64_sve_hw } */ +/* { dg-options "-O3 -march=armv8-a+sve -mautovec-preference=asimd-only -fdump-tree-vect-details" }*/ + +short p[128]; + +bool __attribute__((noipa)) +fxort (int n) +{ + bool r = true; + for (int i = 0; i < n; ++i) + r ^= (p[i] != 0); + return r; +} + +bool __attribute__((noipa)) +fxorf (int n) +{ + bool r = false; + for (int i = 0; i < n; ++i) + r ^= (p[i] != 0); + return r; +} + +int main() +{ + __builtin_memset (p, 1, sizeof(p)); + + for (int n = 0; n < 77; ++n) + if (fxort (n) != !(n & 1)) + __builtin_abort (); + + for (int n = 0; n < 77; ++n) + if (fxorf (n) != (n & 1)) + __builtin_abort (); + + __builtin_memset (p, 0, sizeof(p)); + + for (int n = 0; n < 77; ++n) + if (!fxort (n)) + __builtin_abort (); + + for (int n = 0; n < 77; ++n) + if (fxorf (n)) + __builtin_abort (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 2 "vect" } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-16.c 
b/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-16.c new file mode 100644 index 00000000000..e731b556424 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-16.c @@ -0,0 +1,50 @@ +/* { dg-do run } */ +/* { dg-require-effective-target aarch64_sve_hw } */ +/* { dg-options "-O3 -march=armv8-a+sve -mautovec-preference=asimd-only -fdump-tree-vect-details" }*/ + +int p[128]; + +bool __attribute__((noipa)) +fxort (int n) +{ + bool r = true; + for (int i = 0; i < n; ++i) + r ^= (p[i] != 0); + return r; +} + +bool __attribute__((noipa)) +fxorf (int n) +{ + bool r = false; + for (int i = 0; i < n; ++i) + r ^= (p[i] != 0); + return r; +} + +int main() +{ + __builtin_memset (p, 1, sizeof(p)); + + for (int n = 0; n < 77; ++n) + if (fxort (n) != !(n & 1)) + __builtin_abort (); + + for (int n = 0; n < 77; ++n) + if (fxorf (n) != (n & 1)) + __builtin_abort (); + + __builtin_memset (p, 0, sizeof(p)); + + for (int n = 0; n < 77; ++n) + if (!fxort (n)) + __builtin_abort (); + + for (int n = 0; n < 77; ++n) + if (fxorf (n)) + __builtin_abort (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 2 "vect" } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-17.c b/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-17.c new file mode 100644 index 00000000000..efbec019bf7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-17.c @@ -0,0 +1,50 @@ +/* { dg-do run } */ +/* { dg-require-effective-target aarch64_sve_hw } */ +/* { dg-options "-O3 -march=armv8-a+sve -mautovec-preference=asimd-only -fdump-tree-vect-details" }*/ + +long long p[128]; + +bool __attribute__((noipa)) +fxort (int n) +{ + bool r = true; + for (int i = 0; i < n; ++i) + r ^= (p[i] != 0); + return r; +} + +bool __attribute__((noipa)) +fxorf (int n) +{ + bool r = false; + for (int i = 0; i < n; ++i) + r ^= (p[i] != 0); + return r; +} + +int main() +{ + __builtin_memset (p, 1, sizeof(p)); + + for (int n = 0; n < 77; ++n) 
+ if (fxort (n) != !(n & 1)) + __builtin_abort (); + + for (int n = 0; n < 77; ++n) + if (fxorf (n) != (n & 1)) + __builtin_abort (); + + __builtin_memset (p, 0, sizeof(p)); + + for (int n = 0; n < 77; ++n) + if (!fxort (n)) + __builtin_abort (); + + for (int n = 0; n < 77; ++n) + if (fxorf (n)) + __builtin_abort (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 2 "vect" { target { vect_int && vect_condition } } } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-18.c b/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-18.c new file mode 100644 index 00000000000..a47c306e13c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-18.c @@ -0,0 +1,60 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -march=armv8-a+sve -mautovec-preference=asimd-only -fno-schedule-insns -fno-reorder-blocks -fno-schedule-insns2 -fdump-tree-vect-details" }*/ +/* { dg-final { check-function-bodies "**" "" } } */ + +char p[128]; + +/* +** fand: +** ... +** ptrue p[0-9]+.b, vl16 +** cmpeq p[0-9]+.b, p[0-9]+/z, z[0-9]+.b, #0 +** cset w[0-9]+, none +** ... +*/ +bool __attribute__((noipa)) +fand (int n) +{ + bool r = true; + for (int i = 0; i < n; ++i) + r &= (p[i] != 0); + return r; +} + +/* +** fior: +** ... +** ptrue p[0-9]+.b, vl16 +** cmpne p[0-9]+.b, p[0-9]+/z, z[0-9]+.b, #0 +** cset w[0-9]+, any +** ... +*/ +bool __attribute__((noipa)) +fior (int n) +{ + bool r = false; + for (int i = 0; i < n; ++i) + r |= (p[i] != 0); + return r; +} + +/* +** fxor: +** ... +** ptrue p[0-9]+.b, vl16 +** cmpne p[0-9]+.b, p[0-9]+/z, z[0-9]+.b, #0 +** cntp x[0-9]+, p[0-9]+, p[0-9]+.b +** and w[0-9]+, w[0-9]+, 1 +** ... +*/ +bool __attribute__((noipa)) +fxor (int n) +{ + bool r = false; + for (int i = 0; i < n; ++i) + r ^= (p[i] != 0); + return r; +} + +/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 3 "vect" } } */ +