From: Christophe Lyon
Date: Thu, 3 Jun 2021 14:35:50 +0000 (+0000)
Subject: arm: Auto-vectorization for MVE: add pack/unpack patterns
X-Git-Tag: basepoints/gcc-13~6879
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=046a3beb1673bf4a61c131373b6a5e84158e92bf;p=thirdparty%2Fgcc.git

arm: Auto-vectorization for MVE: add pack/unpack patterns

This patch adds vec_unpack<US>_hi_<mode>, vec_unpack<US>_lo_<mode> and
vec_pack_trunc_<mode> patterns for MVE.

It does so by moving the unpack patterns from neon.md to vec-common.md,
while extending them to support MVE.  The pack expander is derived from
the Neon one (which in turn is renamed into
neon_quad_vec_pack_trunc_<mode>).

The patch introduces mve_vec_unpack<US>_lo_<mode> and
mve_vec_unpack<US>_hi_<mode>, which are similar to their Neon
counterparts except for the assembly syntax.

The patch introduces @mve_vec_pack_trunc_lo_<mode> to avoid the need
for a zero-initialized temporary, which would be needed if the
vec_pack_trunc_<mode> expander called @mve_vmovn[bt]q_<supf><mode>
instead.

With this patch, we can now vectorize the 16 and 8-bit versions of vclz
and vshl, although the generated code could still be improved.  For
test_clz_s16, we now generate:
	vldrh.16   q3, [r1]
	vmovlb.s16 q2, q3
	vmovlt.s16 q3, q3
	vclz.i32   q2, q2
	vclz.i32   q3, q3
	vmovnb.i32 q1, q2
	vmovnt.i32 q1, q3
	vstrh.16   q1, [r0]
which could be improved to:
	vldrh.16   q3, [r1]
	vclz.i16   q1, q3
	vstrh.16   q1, [r0]
if we could avoid the unpack/pack steps.

For reference, clang-12 generates:
	vldrh.s32  q0, [r1]
	vldrh.s32  q1, [r1, #8]
	vclz.i32   q0, q0
	vstrh.32   q0, [r0]
	vclz.i32   q0, q1
	vstrh.32   q0, [r0, #8]

2021-06-11  Christophe Lyon

	gcc/
	* config/arm/mve.md (mve_vec_unpack<US>_lo_<mode>): New pattern.
	(mve_vec_unpack<US>_hi_<mode>): New pattern.
	(@mve_vec_pack_trunc_lo_<mode>): New pattern.
	(mve_vmovntq_<supf><mode>): Prefix with '@'.
	* config/arm/neon.md (vec_unpack<US>_hi_<mode>): Move to
	vec-common.md.
	(vec_unpack<US>_lo_<mode>): Likewise.
	(vec_pack_trunc_<mode>): Rename to neon_quad_vec_pack_trunc_<mode>.
	* config/arm/vec-common.md (vec_unpack<US>_hi_<mode>): New pattern.
	(vec_unpack<US>_lo_<mode>): New.
	(vec_pack_trunc_<mode>): New.

	gcc/testsuite/
	* gcc.target/arm/simd/mve-vclz.c: Update expected results.
	* gcc.target/arm/simd/mve-vshl.c: Likewise.
	* gcc.target/arm/simd/mve-vec-pack.c: New test.
	* gcc.target/arm/simd/mve-vec-unpack.c: New test.
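At the intrinsics level, the test_clz_s16 sequence above corresponds to
the hand-written sketch below.  This is an editorial illustration using
arm_mve.h, not part of the patch: the function name and the use of
vdupq_n_s16 as a dummy initializer are arbitrary choices, and the
compiler of course vectorizes a plain C loop, not intrinsics.

	#include <arm_mve.h>

	/* Widen both halves, count leading zeros at 32 bits, then
	   narrow the two results back into a single q register.  */
	void clz_s16_sketch (int16_t *dest, int16_t *a)
	{
	  int16x8_t v = vldrhq_s16 (a);    /* vldrh.16 */
	  int32x4_t lo = vmovlbq_s16 (v);  /* vmovlb.s16 */
	  int32x4_t hi = vmovltq_s16 (v);  /* vmovlt.s16 */
	  lo = vclzq_s32 (lo);             /* vclz.i32 */
	  hi = vclzq_s32 (hi);             /* vclz.i32 */
	  /* vmovnb only writes the bottom lanes, so any initialized
	     value works as the first operand here.  */
	  int16x8_t r = vmovnbq_s32 (vdupq_n_s16 (0), lo); /* vmovnb.i32 */
	  r = vmovntq_s32 (r, hi);         /* vmovnt.i32 */
	  vstrhq_s16 (dest, r);            /* vstrh.16 */
	}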
---

diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index 99e46d0bc69e..e393518ea882 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -535,6 +535,26 @@
   [(set_attr "type" "mve_move")
 ])
 
+(define_insn "mve_vec_unpack<US>_lo_<mode>"
+  [(set (match_operand:<V_unpack> 0 "register_operand" "=w")
+	(SE:<V_unpack> (vec_select:<V_HALF>
+			  (match_operand:MVE_3 1 "register_operand" "w")
+			  (match_operand:MVE_3 2 "vect_par_constant_low" ""))))]
+  "TARGET_HAVE_MVE"
+  "vmovlb.<US>%#<V_sz_elem>	%q0, %q1"
+  [(set_attr "type" "mve_move")]
+)
+
+(define_insn "mve_vec_unpack<US>_hi_<mode>"
+  [(set (match_operand:<V_unpack> 0 "register_operand" "=w")
+	(SE:<V_unpack> (vec_select:<V_HALF>
+			  (match_operand:MVE_3 1 "register_operand" "w")
+			  (match_operand:MVE_3 2 "vect_par_constant_high" ""))))]
+  "TARGET_HAVE_MVE"
+  "vmovlt.<US>%#<V_sz_elem>	%q0, %q1"
+  [(set_attr "type" "mve_move")]
+)
+
 ;;
 ;; [vcvtpq_s, vcvtpq_u])
 ;;
@@ -2199,10 +2219,23 @@
   [(set_attr "type" "mve_move")
 ])
 
+;; vmovnb pattern used by the vec_pack_trunc expander to avoid the
+;; need for an uninitialized input operand.
+(define_insn "@mve_vec_pack_trunc_lo_<mode>"
+  [
+   (set (match_operand:<V_narrow_pack> 0 "s_register_operand" "=w")
+	(unspec:<V_narrow_pack> [(match_operand:MVE_5 1 "s_register_operand" "w")]
+	 VMOVNBQ_S))
+  ]
+  "TARGET_HAVE_MVE"
+  "vmovnb.i%#<V_sz_elem>	%q0, %q1"
+  [(set_attr "type" "mve_move")
+])
+
 ;;
 ;; [vmovntq_s, vmovntq_u])
 ;;
-(define_insn "mve_vmovntq_<supf><mode>"
+(define_insn "@mve_vmovntq_<supf><mode>"
   [
    (set (match_operand:<V_narrow_pack> 0 "s_register_operand" "=w")
 	(unspec:<V_narrow_pack> [(match_operand:<V_narrow_pack> 1 "s_register_operand" "0")
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index 0fdffaf4ec48..392d96079191 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -5924,43 +5924,6 @@ if (BYTES_BIG_ENDIAN)
   [(set_attr "type" "neon_shift_imm_long")]
 )
 
-(define_expand "vec_unpack<US>_hi_<mode>"
-  [(match_operand:<V_unpack> 0 "register_operand")
-   (SE:<V_unpack> (match_operand:VU 1 "register_operand"))]
-  "TARGET_NEON && !BYTES_BIG_ENDIAN"
-  {
-   rtvec v = rtvec_alloc (<V_mode_nunits>/2);
-   rtx t1;
-   int i;
-   for (i = 0; i < (<V_mode_nunits>/2); i++)
-     RTVEC_ELT (v, i) = GEN_INT ((<V_mode_nunits>/2) + i);
-
-   t1 = gen_rtx_PARALLEL (<MODE>mode, v);
-   emit_insn (gen_neon_vec_unpack<US>_hi_<mode> (operands[0],
-						 operands[1],
-						 t1));
-   DONE;
-  }
-)
-
-(define_expand "vec_unpack<US>_lo_<mode>"
-  [(match_operand:<V_unpack> 0 "register_operand")
-   (SE:<V_unpack> (match_operand:VU 1 "register_operand"))]
-  "TARGET_NEON && !BYTES_BIG_ENDIAN"
-  {
-   rtvec v = rtvec_alloc (<V_mode_nunits>/2);
-   rtx t1;
-   int i;
-   for (i = 0; i < (<V_mode_nunits>/2); i++)
-     RTVEC_ELT (v, i) = GEN_INT (i);
-
-   t1 = gen_rtx_PARALLEL (<MODE>mode, v);
-   emit_insn (gen_neon_vec_unpack<US>_lo_<mode> (operands[0],
-						 operands[1],
-						 t1));
-   DONE;
-  }
-)
-
 (define_insn "neon_vec_<US>mult_lo_<mode>"
   [(set (match_operand:<V_unpack> 0 "register_operand" "=w")
         (mult:<V_unpack> (SE:<V_unpack> (vec_select:<V_HALF>
@@ -6176,7 +6139,7 @@ if (BYTES_BIG_ENDIAN)
 ; because the ordering of vector elements in Q registers is different from what
 ; the semantics of the instructions require.
 
-(define_insn "vec_pack_trunc_<mode>"
+(define_insn "neon_quad_vec_pack_trunc_<mode>"
   [(set (match_operand:<V_narrow_pack> 0 "register_operand" "=&w")
 	(vec_concat:<V_narrow_pack>
 	  (truncate:<V_narrow>
diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
index 430a92ce966d..f90afa4cdb91 100644
--- a/gcc/config/arm/vec-common.md
+++ b/gcc/config/arm/vec-common.md
@@ -632,3 +632,73 @@
   "ARM_HAVE_<MODE>_ARITH && !TARGET_REALLY_IWMMXT"
 )
+
+;; vmovl[tb] are not available for V4SI on MVE
+(define_expand "vec_unpack<US>_hi_<mode>"
+  [(set (match_operand:<V_unpack> 0 "register_operand")
+	(SE:<V_unpack> (vec_select:<V_HALF>
+			  (match_operand:VU 1 "register_operand")
+			  (match_dup 2))))]
+  "ARM_HAVE_<MODE>_ARITH
+   && !TARGET_REALLY_IWMMXT
+   && ! (<MODE>mode == V4SImode && TARGET_HAVE_MVE)
+   && !BYTES_BIG_ENDIAN"
+  {
+    rtvec v = rtvec_alloc (<V_mode_nunits>/2);
+    int i;
+    for (i = 0; i < (<V_mode_nunits>/2); i++)
+      RTVEC_ELT (v, i) = GEN_INT ((<V_mode_nunits>/2) + i);
+
+    operands[2] = gen_rtx_PARALLEL (<MODE>mode, v);
+  }
+)
+
+;; vmovl[tb] are not available for V4SI on MVE
+(define_expand "vec_unpack<US>_lo_<mode>"
+  [(set (match_operand:<V_unpack> 0 "register_operand")
+	(SE:<V_unpack> (vec_select:<V_HALF>
+			  (match_operand:VU 1 "register_operand")
+			  (match_dup 2))))]
+  "ARM_HAVE_<MODE>_ARITH
+   && !TARGET_REALLY_IWMMXT
+   && ! (<MODE>mode == V4SImode && TARGET_HAVE_MVE)
+   && !BYTES_BIG_ENDIAN"
+  {
+    rtvec v = rtvec_alloc (<V_mode_nunits>/2);
+    int i;
+    for (i = 0; i < (<V_mode_nunits>/2); i++)
+      RTVEC_ELT (v, i) = GEN_INT (i);
+
+    operands[2] = gen_rtx_PARALLEL (<MODE>mode, v);
+  }
+)
+
+;; vmovn[tb] are not available for V2DI on MVE
+(define_expand "vec_pack_trunc_<mode>"
+  [(set (match_operand:<V_narrow_pack> 0 "register_operand")
+	(vec_concat:<V_narrow_pack>
+	  (truncate:<V_narrow>
+	    (match_operand:VN 1 "register_operand"))
+	  (truncate:<V_narrow>
+	    (match_operand:VN 2 "register_operand"))))]
+  "ARM_HAVE_<MODE>_ARITH
+   && !TARGET_REALLY_IWMMXT
+   && ! (<MODE>mode == V2DImode && TARGET_HAVE_MVE)
+   && !BYTES_BIG_ENDIAN"
+  {
+    if (TARGET_NEON)
+      {
+	emit_insn (gen_neon_quad_vec_pack_trunc_<mode> (operands[0], operands[1],
+							operands[2]));
+      }
+    else
+      {
+	rtx tmpreg = gen_reg_rtx (<V_narrow_pack>mode);
+	emit_insn (gen_mve_vec_pack_trunc_lo (<MODE>mode, tmpreg, operands[1]));
+	emit_insn (gen_mve_vmovntq (VMOVNTQ_S, <MODE>mode,
+				    operands[0], tmpreg, operands[2]));
+      }
+    DONE;
+  }
+)
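To make the MVE branch of the expander concrete: @mve_vec_pack_trunc_lo_<mode>
writes the narrowed bottom lanes without reading its destination, while the
regular vmovn[bt]q insns tie operand 1 as both input and output (the "0"
constraint visible on @mve_vmovntq above), so routing the expander through
them would require materializing an initialized vector first.  A rough
intrinsics-level analogy follows; it is an editorial sketch, not compiler
output, and the function name is an arbitrary choice.

	#include <arm_mve.h>

	/* Pack two int32x4_t vectors into one int16x8_t, as the MVE
	   branch of the expander does: vmovnb fills the bottom lanes,
	   vmovnt the top lanes.  */
	int16x8_t pack_trunc_s32 (int32x4_t lo, int32x4_t hi)
	{
	  /* At the intrinsics level we must supply some first operand;
	     the dedicated lo pattern avoids this dummy initialization
	     at the RTL level.  */
	  int16x8_t t = vmovnbq_s32 (vdupq_n_s16 (0), lo);
	  return vmovntq_s32 (t, hi);
	}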
diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vclz.c b/gcc/testsuite/gcc.target/arm/simd/mve-vclz.c
index 7068736bc283..5d6e991cfc6d 100644
--- a/gcc/testsuite/gcc.target/arm/simd/mve-vclz.c
+++ b/gcc/testsuite/gcc.target/arm/simd/mve-vclz.c
@@ -21,8 +21,9 @@ FUNC(u, uint, 16, clz)
 FUNC(s, int, 8, clz)
 FUNC(u, uint, 8, clz)
 
-/* 16 and 8-bit versions are not vectorized because they need pack/unpack
-   patterns since __builtin_clz uses 32-bit parameter and return value.  */
-/* { dg-final { scan-assembler-times {vclz\.i32 q[0-9]+, q[0-9]+} 2 } } */
+/* 16 and 8-bit versions still use 32-bit intermediate temporaries, so for
+   instance instead of one vclz.i8 we need 4 vclz.i32, leading to a total of
+   14 vclz.i32 expected in this testcase.  */
+/* { dg-final { scan-assembler-times {vclz\.i32 q[0-9]+, q[0-9]+} 14 } } */
 /* { dg-final { scan-assembler-times {vclz\.i16 q[0-9]+, q[0-9]+} 2 { xfail *-*-* } } } */
 /* { dg-final { scan-assembler-times {vclz\.i8 q[0-9]+, q[0-9]+} 2 { xfail *-*-* } } } */
diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vec-pack.c b/gcc/testsuite/gcc.target/arm/simd/mve-vec-pack.c
new file mode 100644
index 000000000000..43642b2fec58
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/mve-vec-pack.c
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-add-options arm_v8_1m_mve } */
+/* { dg-additional-options "-O3" } */
+
+#include <stdint.h>
+
+#define FUNC(SIGN, TYPE, DSTBITS, BITS, NAME)				\
+  void test_ ## NAME ##_ ## SIGN ## BITS (TYPE##DSTBITS##_t * __restrict__ dest, \
+					  TYPE##BITS##_t *a) {		\
+    int i;								\
+    for (i = 0; i < (256 / BITS); i++) {				\
+      dest[i] = a[i];							\
+    }									\
+  }
+
+FUNC(s, int, 16, 32, pack)
+FUNC(u, uint, 16, 32, pack)
+FUNC(s, int, 8, 16, pack)
+FUNC(u, uint, 8, 16, pack)
+
+/* { dg-final { scan-assembler-times {vmovnt\.i32\tq[0-9]+, q[0-9]+} 2 } } */
+/* { dg-final { scan-assembler-times {vmovnb\.i32\tq[0-9]+, q[0-9]+} 2 } } */
+/* { dg-final { scan-assembler-times {vmovnt\.i16\tq[0-9]+, q[0-9]+} 2 } } */
+/* { dg-final { scan-assembler-times {vmovnb\.i16\tq[0-9]+, q[0-9]+} 2 } } */
+/* { dg-final { scan-assembler-not {vldr\.64\td[0-9]+, \.L} } } */
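For readability, the first instantiation in the new pack test,
FUNC(s, int, 16, 32, pack), expands to the narrowing copy loop below,
which the new expander lets the vectorizer implement with one
vmovnb.i32/vmovnt.i32 pair:

	#include <stdint.h>

	/* 32-bit to 16-bit narrowing copy over 8 elements: two int32x4
	   inputs packed into one int16x8 result.  */
	void test_pack_s32 (int16_t * __restrict__ dest, int32_t *a) {
	  int i;
	  for (i = 0; i < (256 / 32); i++) {
	    dest[i] = a[i];
	  }
	}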
diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vec-unpack.c b/gcc/testsuite/gcc.target/arm/simd/mve-vec-unpack.c
new file mode 100644
index 000000000000..cdc62f854ad1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/mve-vec-unpack.c
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-add-options arm_v8_1m_mve } */
+/* { dg-additional-options "-O3" } */
+
+#include <stdint.h>
+
+#define FUNC(SIGN, TYPE, DSTBITS, BITS, NAME)				\
+  void test_ ## NAME ##_ ## SIGN ## BITS (TYPE##DSTBITS##_t * __restrict__ dest, \
+					  TYPE##BITS##_t *a) {		\
+    int i;								\
+    for (i = 0; i < (128 / BITS); i++) {				\
+      dest[i] = a[i];							\
+    }									\
+  }
+
+FUNC(s, int, 32, 16, unpack)
+FUNC(u, uint, 32, 16, unpack)
+FUNC(s, int, 16, 8, unpack)
+FUNC(u, uint, 16, 8, unpack)
+
+/* { dg-final { scan-assembler-times {vmovlt\.s16 q[0-9]+, q[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {vmovlb\.s16 q[0-9]+, q[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {vmovlt\.u16 q[0-9]+, q[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {vmovlb\.u16 q[0-9]+, q[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {vmovlt\.s8 q[0-9]+, q[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {vmovlb\.s8 q[0-9]+, q[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {vmovlt\.u8 q[0-9]+, q[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {vmovlb\.u8 q[0-9]+, q[0-9]+} 1 } } */
diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vshl.c b/gcc/testsuite/gcc.target/arm/simd/mve-vshl.c
index 7a0644997c8b..91dd942d818f 100644
--- a/gcc/testsuite/gcc.target/arm/simd/mve-vshl.c
+++ b/gcc/testsuite/gcc.target/arm/simd/mve-vshl.c
@@ -56,7 +56,10 @@ FUNC_IMM(u, uint, 8, 16, <<, vshlimm)
 /* MVE has only 128-bit vectors, so we can vectorize only half of the
    functions above.  */
 /* We only emit vshl.u, which is equivalent to vshl.s anyway.  */
-/* { dg-final { scan-assembler-times {vshl.u[0-9]+\tq[0-9]+, q[0-9]+} 2 } } */
+/* 16 and 8-bit versions still use 32-bit intermediate temporaries, so for
+   instance instead of one vshl.u8 we need 4 vshl.u32, leading to a total of
+   14 vshl.u32 expected in this testcase.  */
+/* { dg-final { scan-assembler-times {vshl.u[0-9]+\tq[0-9]+, q[0-9]+} 14 } } */
 /* We emit vshl.i when the shift amount is an immediate.  */
 /* { dg-final { scan-assembler-times {vshl.i[0-9]+\tq[0-9]+, q[0-9]+} 6 } } */
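Correspondingly, FUNC(s, int, 32, 16, unpack) in the unpack test expands
to the widening copy below, now vectorized with a vmovlb.s16/vmovlt.s16
pair (compare the test_clz_s16 sequence in the commit message, minus the
vclz step):

	#include <stdint.h>

	/* 16-bit to 32-bit widening copy over 8 elements: one int16x8
	   input unpacked into two int32x4 results.  */
	void test_unpack_s16 (int32_t * __restrict__ dest, int16_t *a) {
	  int i;
	  for (i = 0; i < (128 / 16); i++) {
	    dest[i] = a[i];
	  }
	}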