From: Sylvia Taylor
Date: Thu, 18 Jul 2019 15:42:13 +0000 (+0000)
Subject: [patch1/2][arm][PR90317]: fix sha1 patterns
X-Git-Tag: misc/cutover-git~3942
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=e38341a8e0c7f89eb2146feddea8c2f3bf25a331;p=thirdparty%2Fgcc.git

[patch1/2][arm][PR90317]: fix sha1 patterns

This patch fixes:

1) An ICE when using the crypto_sha1h intrinsic, caused by an
incompatible mode used for zero_extend.  The zero_extend has been
removed, as it is not a good choice for vector modes, and using an
equivalent single mode such as TI (128 bits) instead of V4SI produces
extra instructions, making it inefficient.  This affects gcc version 8
and above.

2) Incorrect combine optimizations made due to vec_select usage in the
sha1 patterns on arm.  The patterns should only combine a vec_select
into a sha1h instruction when the selected lane is 0.  This affects
gcc version 5 and above.

- Fixed by explicitly declaring the valid const int for such
  optimizations.  For cases where the lane is not 0, the vector lane
  selection now occurs in a separate instruction (e.g. vmov) prior to
  sha1h.

- Updated the sha1h testcases on arm to check for additional cases
  with custom vector lane selection.

The intrinsic functions for the sha1 patterns have also been
simplified, which eliminates extra vmovs such as:
- vmov.i32 q8, #0.
(An illustrative snippet follows the ChangeLog diff below.)

2019-07-18  Sylvia Taylor

        PR target/90317
        * config/arm/arm_neon.h
        (vsha1h_u32): Refactor.
        (vsha1cq_u32): Likewise.
        (vsha1pq_u32): Likewise.
        (vsha1mq_u32): Likewise.
        * config/arm/crypto.md:
        (crypto_sha1h): Remove zero extend, correct vec select.
        (crypto_sha1c): Correct vec select.
        (crypto_sha1m): Likewise.
        (crypto_sha1p): Likewise.
        * gcc.target/arm/crypto-vsha1cq_u32.c (foo): Change return type to
        uint32_t.
        (GET_LANE, TEST_SHA1C_VEC_SELECT): New.
        * gcc.target/arm/crypto-vsha1h_u32.c (foo): Change return type to
        uint32_t.
        (GET_LANE, TEST_SHA1H_VEC_SELECT): New.
        * gcc.target/arm/crypto-vsha1mq_u32.c (foo): Change return type to
        uint32_t.
        (GET_LANE, TEST_SHA1M_VEC_SELECT): New.
        * gcc.target/arm/crypto-vsha1pq_u32.c (foo): Change return type to
        uint32_t.
        (GET_LANE, TEST_SHA1P_VEC_SELECT): New.

From-SVN: r273574
---

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index a4a625e7eb03..668dc40b7fab 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,17 @@
+2019-07-18  Sylvia Taylor
+
+        PR target/90317
+        * config/arm/arm_neon.h
+        (vsha1h_u32): Refactor.
+        (vsha1cq_u32): Likewise.
+        (vsha1pq_u32): Likewise.
+        (vsha1mq_u32): Likewise.
+        * config/arm/crypto.md:
+        (crypto_sha1h): Remove zero extend, correct vec select.
+        (crypto_sha1c): Correct vec select.
+        (crypto_sha1m): Likewise.
+        (crypto_sha1p): Likewise.
+
 2019-07-18  Richard Earnshaw
 
         * config/arm/predicates.md (arm_borrow_operation): New predicate.
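The illustrative snippet referenced above: a sketch of the intended
behaviour, not part of the patch itself, with made-up function names;
the expected codegen is inferred from the updated testcases below.

#include <arm_neon.h>

/* Lane 0 matches the endian-adjusted lane the patterns now require,
   so the vec_select is expected to fold into the sha1h.32 itself.  */
uint32_t
sha1h_lane0 (uint32x4_t val)
{
  return vsha1h_u32 (vgetq_lane_u32 (val, 0));
}

/* A non-zero lane must no longer be combined into sha1h.32; the lane
   is expected to be extracted first (vmov.32 from a d-register lane)
   and duplicated (vdup.32) before the sha1h.32.  */
uint32_t
sha1h_lane2 (uint32x4_t val)
{
  return vsha1h_u32 (vgetq_lane_u32 (val, 2));
}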
diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index 6b982392ece6..1f200d491d1d 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -16938,37 +16938,32 @@ __extension__ extern __inline uint32_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsha1h_u32 (uint32_t __hash_e)
 {
-  uint32x4_t __t = vdupq_n_u32 (0);
-  __t = vsetq_lane_u32 (__hash_e, __t, 0);
-  __t = __builtin_arm_crypto_sha1h (__t);
-  return vgetq_lane_u32 (__t, 0);
+  return vgetq_lane_u32 (__builtin_arm_crypto_sha1h (vdupq_n_u32 (__hash_e)),
+                         0);
 }
 
 __extension__ extern __inline uint32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsha1cq_u32 (uint32x4_t __hash_abcd, uint32_t __hash_e, uint32x4_t __wk)
 {
-  uint32x4_t __t = vdupq_n_u32 (0);
-  __t = vsetq_lane_u32 (__hash_e, __t, 0);
-  return __builtin_arm_crypto_sha1c (__hash_abcd, __t, __wk);
+  return __builtin_arm_crypto_sha1c (__hash_abcd, vdupq_n_u32 (__hash_e),
+                                     __wk);
 }
 
 __extension__ extern __inline uint32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsha1pq_u32 (uint32x4_t __hash_abcd, uint32_t __hash_e, uint32x4_t __wk)
 {
-  uint32x4_t __t = vdupq_n_u32 (0);
-  __t = vsetq_lane_u32 (__hash_e, __t, 0);
-  return __builtin_arm_crypto_sha1p (__hash_abcd, __t, __wk);
+  return __builtin_arm_crypto_sha1p (__hash_abcd, vdupq_n_u32 (__hash_e),
+                                     __wk);
 }
 
 __extension__ extern __inline uint32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsha1mq_u32 (uint32x4_t __hash_abcd, uint32_t __hash_e, uint32x4_t __wk)
 {
-  uint32x4_t __t = vdupq_n_u32 (0);
-  __t = vsetq_lane_u32 (__hash_e, __t, 0);
-  return __builtin_arm_crypto_sha1m (__hash_abcd, __t, __wk);
+  return __builtin_arm_crypto_sha1m (__hash_abcd, vdupq_n_u32 (__hash_e),
+                                     __wk);
 }
 
 __extension__ extern __inline uint32x4_t
diff --git a/gcc/config/arm/crypto.md b/gcc/config/arm/crypto.md
index bf34f69fc75c..115c515ac469 100644
--- a/gcc/config/arm/crypto.md
+++ b/gcc/config/arm/crypto.md
@@ -105,14 +105,18 @@
   [(set_attr "type" "<crypto_type>")]
 )
 
+/* The vec_select operation always selects index 0 from the lower V2SI subreg
+   of the V4SI, adjusted for endianness.  Required due to neon_vget_lane and
+   neon_set_lane that change the element ordering in memory for big-endian.  */
+
 (define_insn "crypto_sha1h"
   [(set (match_operand:V4SI 0 "register_operand" "=w")
-        (zero_extend:V4SI
-          (unspec:SI [(vec_select:SI
-                       (match_operand:V4SI 1 "register_operand" "w")
-                       (parallel [(match_operand:SI 2 "immediate_operand" "i")]))]
-           UNSPEC_SHA1H)))]
-  "TARGET_CRYPTO"
+        (unspec:V4SI
+          [(vec_select:SI
+            (match_operand:V4SI 1 "register_operand" "w")
+            (parallel [(match_operand:SI 2 "immediate_operand" "i")]))]
+          UNSPEC_SHA1H))]
+  "TARGET_CRYPTO && INTVAL (operands[2]) == NEON_ENDIAN_LANE_N (V2SImode, 0)"
   "sha1h.32\\t%q0, %q1"
   [(set_attr "type" "crypto_sha1_fast")]
 )
@@ -127,6 +131,10 @@
   [(set_attr "type" "crypto_pmull")]
 )
 
+/* The vec_select operation always selects index 0 from the lower V2SI subreg
+   of the V4SI, adjusted for endianness.  Required due to neon_vget_lane and
+   neon_set_lane that change the element ordering in memory for big-endian.  */
+
 (define_insn "crypto_<crypto_pattern>"
   [(set (match_operand:V4SI 0 "register_operand" "=w")
         (unspec:<crypto_mode>
@@ -136,7 +144,7 @@
                         (parallel [(match_operand:SI 4 "immediate_operand" "i")]))
                       (match_operand:<crypto_mode> 3 "register_operand" "w")]
          CRYPTO_SELECTING))]
-  "TARGET_CRYPTO"
+  "TARGET_CRYPTO && INTVAL (operands[4]) == NEON_ENDIAN_LANE_N (V2SImode, 0)"
   "<crypto_pattern>.<crypto_size_sfx>\\t%q0, %q2, %q3"
   [(set_attr "type" "<crypto_type>")]
 )
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 0f47604da85a..7bf322fc1822 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,19 @@
+2019-07-18  Sylvia Taylor
+
+        PR target/90317
+        * gcc.target/arm/crypto-vsha1cq_u32.c (foo): Change return type to
+        uint32_t.
+        (GET_LANE, TEST_SHA1C_VEC_SELECT): New.
+        * gcc.target/arm/crypto-vsha1h_u32.c (foo): Change return type to
+        uint32_t.
+        (GET_LANE, TEST_SHA1H_VEC_SELECT): New.
+        * gcc.target/arm/crypto-vsha1mq_u32.c (foo): Change return type to
+        uint32_t.
+        (GET_LANE, TEST_SHA1M_VEC_SELECT): New.
+        * gcc.target/arm/crypto-vsha1pq_u32.c (foo): Change return type to
+        uint32_t.
+        (GET_LANE, TEST_SHA1P_VEC_SELECT): New.
+
 2019-07-18  Jan Hubicka
 
         * g++.dg/lto/alias-5_0.C: New testcase.
diff --git a/gcc/testsuite/gcc.target/arm/crypto-vsha1cq_u32.c b/gcc/testsuite/gcc.target/arm/crypto-vsha1cq_u32.c
index 4dc9dee6617e..41f97a74d6f8 100644
--- a/gcc/testsuite/gcc.target/arm/crypto-vsha1cq_u32.c
+++ b/gcc/testsuite/gcc.target/arm/crypto-vsha1cq_u32.c
@@ -1,11 +1,12 @@
 /* { dg-do compile } */
 /* { dg-require-effective-target arm_crypto_ok } */
 /* { dg-add-options arm_crypto } */
+/* { dg-additional-options "-O3" } */
 
 #include "arm_neon.h"
 
-int
-foo (void)
+uint32_t foo (void)
+
 {
   uint32_t hash = 0xdeadbeef;
   uint32x4_t a = {0, 1, 2, 3};
@@ -15,4 +16,20 @@ foo (void)
   return res[0];
 }
 
-/* { dg-final { scan-assembler "sha1c.32\tq\[0-9\]+, q\[0-9\]+" } } */
+#define GET_LANE(lane) \
+  uint32x4_t foo_lane##lane (uint32x4_t val, uint32x4_t a, uint32x4_t b)\
+  { \
+    return vsha1cq_u32 (a, vgetq_lane_u32 (val, lane), b); \
+  }
+
+#define TEST_SHA1C_VEC_SELECT(FUNC) \
+  FUNC (0) \
+  FUNC (1) \
+  FUNC (2) \
+  FUNC (3) \
+
+TEST_SHA1C_VEC_SELECT (GET_LANE)
+
+/* { dg-final { scan-assembler-times {sha1c.32\tq[0-9]+, q[0-9]+} 5 } } */
+/* { dg-final { scan-assembler-times {vdup.32\tq[0-9]+, r[0-9]+} 3 } } */
+/* { dg-final { scan-assembler-times {vmov.32\tr[0-9]+, d[0-9]+\[[0-9]+\]+} 4 } } */
diff --git a/gcc/testsuite/gcc.target/arm/crypto-vsha1h_u32.c b/gcc/testsuite/gcc.target/arm/crypto-vsha1h_u32.c
index dee277485247..b2846675a276 100644
--- a/gcc/testsuite/gcc.target/arm/crypto-vsha1h_u32.c
+++ b/gcc/testsuite/gcc.target/arm/crypto-vsha1h_u32.c
@@ -1,14 +1,31 @@
 /* { dg-do compile } */
 /* { dg-require-effective-target arm_crypto_ok } */
 /* { dg-add-options arm_crypto } */
+/* { dg-additional-options "-O3" } */
 
 #include "arm_neon.h"
 
-int
-foo (void)
+uint32_t foo (void)
+
 {
   uint32_t val = 0xdeadbeef;
   return vsha1h_u32 (val);
 }
 
-/* { dg-final { scan-assembler "sha1h.32\tq\[0-9\]+, q\[0-9\]+" } } */
+#define GET_LANE(lane) \
+  uint32_t foo_lane##lane (uint32x4_t val) \
+  { \
+    return vsha1h_u32 (vgetq_lane_u32 (val, lane)); \
+  }
+
+#define TEST_SHA1H_VEC_SELECT(FUNC) \
+  FUNC (0) \
+  FUNC (1) \
+  FUNC (2) \
+  FUNC (3) \
+
+TEST_SHA1H_VEC_SELECT (GET_LANE)
+
+/* { dg-final { scan-assembler-times {sha1h.32\tq[0-9]+, q[0-9]+} 5 } } */
+/* { dg-final { scan-assembler-times {vdup.32\tq[0-9]+, r[0-9]+} 3 } } */
+/* { dg-final { scan-assembler-times {vmov.32\tr[0-9]+, d[0-9]+\[[0-9]+\]+} 8 } } */
diff --git a/gcc/testsuite/gcc.target/arm/crypto-vsha1mq_u32.c b/gcc/testsuite/gcc.target/arm/crypto-vsha1mq_u32.c
index 672b93a97475..676e64ce779c 100644
--- a/gcc/testsuite/gcc.target/arm/crypto-vsha1mq_u32.c
+++ b/gcc/testsuite/gcc.target/arm/crypto-vsha1mq_u32.c
@@ -1,11 +1,12 @@
 /* { dg-do compile } */
 /* { dg-require-effective-target arm_crypto_ok } */
 /* { dg-add-options arm_crypto } */
+/* { dg-additional-options "-O3" } */
 
 #include "arm_neon.h"
 
-int
-foo (void)
+uint32_t foo (void)
+
 {
   uint32_t hash = 0xdeadbeef;
   uint32x4_t a = {0, 1, 2, 3};
@@ -15,4 +16,20 @@ foo (void)
   return res[0];
 }
 
-/* { dg-final { scan-assembler "sha1m.32\tq\[0-9\]+, q\[0-9\]+" } } */
+#define GET_LANE(lane) \
+  uint32x4_t foo_lane##lane (uint32x4_t val, uint32x4_t a, uint32x4_t b)\
+  { \
+    return vsha1mq_u32 (a, vgetq_lane_u32 (val, lane), b); \
+  }
+
+#define TEST_SHA1M_VEC_SELECT(FUNC) \
+  FUNC (0) \
+  FUNC (1) \
+  FUNC (2) \
+  FUNC (3) \
+
+TEST_SHA1M_VEC_SELECT (GET_LANE)
+
+/* { dg-final { scan-assembler-times {sha1m.32\tq[0-9]+, q[0-9]+} 5 } } */
+/* { dg-final { scan-assembler-times {vdup.32\tq[0-9]+, r[0-9]+} 3 } } */
+/* { dg-final { scan-assembler-times {vmov.32\tr[0-9]+, d[0-9]+\[[0-9]+\]+} 4 } } */
diff --git a/gcc/testsuite/gcc.target/arm/crypto-vsha1pq_u32.c b/gcc/testsuite/gcc.target/arm/crypto-vsha1pq_u32.c
index ff508e0dc7f6..ed10fe265ba7 100644
--- a/gcc/testsuite/gcc.target/arm/crypto-vsha1pq_u32.c
+++ b/gcc/testsuite/gcc.target/arm/crypto-vsha1pq_u32.c
@@ -1,11 +1,12 @@
 /* { dg-do compile } */
 /* { dg-require-effective-target arm_crypto_ok } */
 /* { dg-add-options arm_crypto } */
+/* { dg-additional-options "-O3" } */
 
 #include "arm_neon.h"
 
-int
-foo (void)
+uint32_t foo (void)
+
 {
   uint32_t hash = 0xdeadbeef;
   uint32x4_t a = {0, 1, 2, 3};
@@ -15,4 +16,20 @@ foo (void)
   return res[0];
 }
 
-/* { dg-final { scan-assembler "sha1p.32\tq\[0-9\]+, q\[0-9\]+" } } */
+#define GET_LANE(lane) \
+  uint32x4_t foo_lane##lane (uint32x4_t val, uint32x4_t a, uint32x4_t b)\
+  { \
+    return vsha1pq_u32 (a, vgetq_lane_u32 (val, lane), b); \
+  }
+
+#define TEST_SHA1P_VEC_SELECT(FUNC) \
+  FUNC (0) \
+  FUNC (1) \
+  FUNC (2) \
+  FUNC (3) \
+
+TEST_SHA1P_VEC_SELECT (GET_LANE)
+
+/* { dg-final { scan-assembler-times {sha1p.32\tq[0-9]+, q[0-9]+} 5 } } */
+/* { dg-final { scan-assembler-times {vdup.32\tq[0-9]+, r[0-9]+} 3 } } */
+/* { dg-final { scan-assembler-times {vmov.32\tr[0-9]+, d[0-9]+\[[0-9]+\]+} 4 } } */
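For reference, the NEON_ENDIAN_LANE_N check used in the new insn
conditions can be modelled roughly as below.  This is only a sketch of
the lane remapping; the real macro is defined in gcc/config/arm/arm.h
and operates on machine modes rather than plain integers.

/* Rough stand-alone model of the endian lane adjustment.  For
   V2SImode (2 elements), lane 0 stays lane 0 on little-endian but
   maps to lane 1 on big-endian, because neon_vget_lane and
   neon_set_lane change the element ordering in memory for
   big-endian targets.  */
static inline int
model_endian_lane_n (int nunits, int lane, int big_endian)
{
  return big_endian ? nunits - 1 - lane : lane;
}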