From 5f3d0b78e011d2a72f9e88b0e9ef5bc081d18f97 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Fri, 30 Jan 2015 06:50:20 -0800
Subject: [PATCH] Use AVX unaligned memcpy only if AVX2 is available

memcpy with unaligned 256-bit AVX register loads/stores is slow on
older processors like Sandy Bridge.  This patch adds
bit_AVX_Fast_Unaligned_Load and sets it only when AVX2 is available.

	[BZ #17801]
	* sysdeps/x86_64/multiarch/init-arch.c (__init_cpu_features):
	Set the bit_AVX_Fast_Unaligned_Load bit for AVX2.
	* sysdeps/x86_64/multiarch/init-arch.h (bit_AVX_Fast_Unaligned_Load):
	New.
	(index_AVX_Fast_Unaligned_Load): Likewise.
	(HAS_AVX_FAST_UNALIGNED_LOAD): Likewise.
	* sysdeps/x86_64/multiarch/memcpy.S (__new_memcpy): Check the
	bit_AVX_Fast_Unaligned_Load bit instead of the bit_AVX_Usable bit.
	* sysdeps/x86_64/multiarch/memcpy_chk.S (__memcpy_chk): Likewise.
	* sysdeps/x86_64/multiarch/mempcpy.S (__mempcpy): Likewise.
	* sysdeps/x86_64/multiarch/mempcpy_chk.S (__mempcpy_chk): Likewise.
	* sysdeps/x86_64/multiarch/memmove.c (__libc_memmove): Replace
	HAS_AVX with HAS_AVX_FAST_UNALIGNED_LOAD.
	* sysdeps/x86_64/multiarch/memmove_chk.c (__memmove_chk): Likewise.
---
 ChangeLog                              | 18 ++++++++++++++++++
 NEWS                                   |  4 ++--
 sysdeps/x86_64/multiarch/init-arch.c   |  9 +++++++--
 sysdeps/x86_64/multiarch/init-arch.h   |  4 ++++
 sysdeps/x86_64/multiarch/memcpy.S      |  2 +-
 sysdeps/x86_64/multiarch/memcpy_chk.S  |  2 +-
 sysdeps/x86_64/multiarch/memmove.c     |  2 +-
 sysdeps/x86_64/multiarch/memmove_chk.c |  2 +-
 sysdeps/x86_64/multiarch/mempcpy.S     |  2 +-
 sysdeps/x86_64/multiarch/mempcpy_chk.S |  2 +-
 10 files changed, 37 insertions(+), 10 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 26f7f3f3b18..a696e396b26 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,21 @@
+2015-01-30  H.J. Lu  <hjl.tools@gmail.com>
+
+	[BZ #17801]
+	* sysdeps/x86_64/multiarch/init-arch.c (__init_cpu_features):
+	Set the bit_AVX_Fast_Unaligned_Load bit for AVX2.
+	* sysdeps/x86_64/multiarch/init-arch.h (bit_AVX_Fast_Unaligned_Load):
+	New.
+	(index_AVX_Fast_Unaligned_Load): Likewise.
+	(HAS_AVX_FAST_UNALIGNED_LOAD): Likewise.
+	* sysdeps/x86_64/multiarch/memcpy.S (__new_memcpy): Check the
+	bit_AVX_Fast_Unaligned_Load bit instead of the bit_AVX_Usable bit.
+	* sysdeps/x86_64/multiarch/memcpy_chk.S (__memcpy_chk): Likewise.
+	* sysdeps/x86_64/multiarch/mempcpy.S (__mempcpy): Likewise.
+	* sysdeps/x86_64/multiarch/mempcpy_chk.S (__mempcpy_chk): Likewise.
+	* sysdeps/x86_64/multiarch/memmove.c (__libc_memmove): Replace
+	HAS_AVX with HAS_AVX_FAST_UNALIGNED_LOAD.
+	* sysdeps/x86_64/multiarch/memmove_chk.c (__memmove_chk): Likewise.
+
 2015-01-29  Andreas Schwab  <schwab@suse.de>
 
 	* sysdeps/nptl/allocrtsig.c: Include <signal.h>.
diff --git a/NEWS b/NEWS
index 8e2729bddd6..c91b9fc58a5 100644
--- a/NEWS
+++ b/NEWS
@@ -17,8 +17,8 @@ Version 2.21
   17601, 17608, 17616, 17625, 17630, 17633, 17634, 17635, 17647, 17653,
   17657, 17658, 17664, 17665, 17668, 17682, 17702, 17717, 17719, 17722,
   17723, 17724, 17725, 17732, 17733, 17744, 17745, 17746, 17747, 17748,
-  17775, 17777, 17780, 17781, 17782, 17791, 17793, 17796, 17797, 17803,
-  17806, 17834, 17844, 17848, 17868, 17869, 17870, 17885, 17892.
+  17775, 17777, 17780, 17781, 17782, 17791, 17793, 17796, 17797, 17801,
+  17803, 17806, 17834, 17844, 17848, 17868, 17869, 17870, 17885, 17892.
 
 * A new semaphore algorithm has been implemented in generic C code for all
   machines.  Previous custom assembly implementations of semaphore were
diff --git a/sysdeps/x86_64/multiarch/init-arch.c b/sysdeps/x86_64/multiarch/init-arch.c
index 9299360612f..7dec21884dc 100644
--- a/sysdeps/x86_64/multiarch/init-arch.c
+++ b/sysdeps/x86_64/multiarch/init-arch.c
@@ -171,9 +171,14 @@ __init_cpu_features (void)
 	  /* Determine if AVX is usable.  */
 	  if (CPUID_AVX)
 	    __cpu_features.feature[index_AVX_Usable] |= bit_AVX_Usable;
-	  /* Determine if AVX2 is usable.  */
+#if index_AVX2_Usable != index_AVX_Fast_Unaligned_Load
+# error index_AVX2_Usable != index_AVX_Fast_Unaligned_Load
+#endif
+	  /* Determine if AVX2 is usable.  Unaligned loads with 256-bit
+	     AVX registers are faster on processors with AVX2.  */
 	  if (CPUID_AVX2)
-	    __cpu_features.feature[index_AVX2_Usable] |= bit_AVX2_Usable;
+	    __cpu_features.feature[index_AVX2_Usable]
+	      |= bit_AVX2_Usable | bit_AVX_Fast_Unaligned_Load;
 	  /* Determine if FMA is usable.  */
 	  if (CPUID_FMA)
 	    __cpu_features.feature[index_FMA_Usable] |= bit_FMA_Usable;
diff --git a/sysdeps/x86_64/multiarch/init-arch.h b/sysdeps/x86_64/multiarch/init-arch.h
index 55f1c5b34cb..e6b5ba5530f 100644
--- a/sysdeps/x86_64/multiarch/init-arch.h
+++ b/sysdeps/x86_64/multiarch/init-arch.h
@@ -25,6 +25,7 @@
 #define bit_FMA4_Usable			(1 << 8)
 #define bit_Slow_SSE4_2			(1 << 9)
 #define bit_AVX2_Usable			(1 << 10)
+#define bit_AVX_Fast_Unaligned_Load	(1 << 11)
 
 /* CPUID Feature flags.  */
 
@@ -74,6 +75,7 @@
 # define index_FMA4_Usable		FEATURE_INDEX_1*FEATURE_SIZE
 # define index_Slow_SSE4_2		FEATURE_INDEX_1*FEATURE_SIZE
 # define index_AVX2_Usable		FEATURE_INDEX_1*FEATURE_SIZE
+# define index_AVX_Fast_Unaligned_Load	FEATURE_INDEX_1*FEATURE_SIZE
 
 #else	/* __ASSEMBLER__ */
 
@@ -169,6 +171,7 @@ extern const struct cpu_features *__get_cpu_features (void)
 # define index_FMA4_Usable		FEATURE_INDEX_1
 # define index_Slow_SSE4_2		FEATURE_INDEX_1
 # define index_AVX2_Usable		FEATURE_INDEX_1
+# define index_AVX_Fast_Unaligned_Load	FEATURE_INDEX_1
 
 # define HAS_ARCH_FEATURE(name) \
   ((__get_cpu_features ()->feature[index_##name] & (bit_##name)) != 0)
@@ -181,5 +184,6 @@ extern const struct cpu_features *__get_cpu_features (void)
 # define HAS_AVX2	HAS_ARCH_FEATURE (AVX2_Usable)
 # define HAS_FMA	HAS_ARCH_FEATURE (FMA_Usable)
 # define HAS_FMA4	HAS_ARCH_FEATURE (FMA4_Usable)
+# define HAS_AVX_FAST_UNALIGNED_LOAD	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
 
 #endif	/* __ASSEMBLER__ */
diff --git a/sysdeps/x86_64/multiarch/memcpy.S b/sysdeps/x86_64/multiarch/memcpy.S
index 992e40db81d..4e18cd30704 100644
--- a/sysdeps/x86_64/multiarch/memcpy.S
+++ b/sysdeps/x86_64/multiarch/memcpy.S
@@ -33,7 +33,7 @@ ENTRY(__new_memcpy)
 	jne	1f
 	call	__init_cpu_features
 1:	leaq	__memcpy_avx_unaligned(%rip), %rax
-	testl	$bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
+	testl	$bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip)
 	jz	1f
 	ret
 1:	leaq	__memcpy_sse2(%rip), %rax
diff --git a/sysdeps/x86_64/multiarch/memcpy_chk.S b/sysdeps/x86_64/multiarch/memcpy_chk.S
index 5e9cf004b08..1e756ea0c23 100644
--- a/sysdeps/x86_64/multiarch/memcpy_chk.S
+++ b/sysdeps/x86_64/multiarch/memcpy_chk.S
@@ -39,7 +39,7 @@ ENTRY(__memcpy_chk)
 	testl	$bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
 	jz	2f
 	leaq	__memcpy_chk_ssse3_back(%rip), %rax
-	testl	$bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
+	testl	$bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip)
 	jz	2f
 	leaq	__memcpy_chk_avx_unaligned(%rip), %rax
 2:	ret
diff --git a/sysdeps/x86_64/multiarch/memmove.c b/sysdeps/x86_64/multiarch/memmove.c
index d93bfd05c0b..dd153a3eaa2 100644
--- a/sysdeps/x86_64/multiarch/memmove.c
+++ b/sysdeps/x86_64/multiarch/memmove.c
@@ -49,7 +49,7 @@ extern __typeof (__redirect_memmove) __memmove_avx_unaligned attribute_hidden;
    ifunc symbol properly.  */
 extern __typeof (__redirect_memmove) __libc_memmove;
 libc_ifunc (__libc_memmove,
-	    HAS_AVX
+	    HAS_AVX_FAST_UNALIGNED_LOAD
 	    ? __memmove_avx_unaligned
 	    : (HAS_SSSE3
 	       ? (HAS_FAST_COPY_BACKWARD
diff --git a/sysdeps/x86_64/multiarch/memmove_chk.c b/sysdeps/x86_64/multiarch/memmove_chk.c
index 743ca2a460f..8b12d002dcb 100644
--- a/sysdeps/x86_64/multiarch/memmove_chk.c
+++ b/sysdeps/x86_64/multiarch/memmove_chk.c
@@ -30,7 +30,7 @@ extern __typeof (__memmove_chk) __memmove_chk_avx_unaligned attribute_hidden;
 #include "debug/memmove_chk.c"
 
 libc_ifunc (__memmove_chk,
-	    HAS_AVX ? __memmove_chk_avx_unaligned :
+	    HAS_AVX_FAST_UNALIGNED_LOAD ? __memmove_chk_avx_unaligned :
 	    (HAS_SSSE3
 	     ? (HAS_FAST_COPY_BACKWARD
 	       ? __memmove_chk_ssse3_back : __memmove_chk_ssse3)
diff --git a/sysdeps/x86_64/multiarch/mempcpy.S b/sysdeps/x86_64/multiarch/mempcpy.S
index cdf1dab62b6..2eaacdf0492 100644
--- a/sysdeps/x86_64/multiarch/mempcpy.S
+++ b/sysdeps/x86_64/multiarch/mempcpy.S
@@ -37,7 +37,7 @@ ENTRY(__mempcpy)
 	testl	$bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
 	jz	2f
 	leaq	__mempcpy_ssse3_back(%rip), %rax
-	testl	$bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
+	testl	$bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip)
 	jz	2f
 	leaq	__mempcpy_avx_unaligned(%rip), %rax
 2:	ret
diff --git a/sysdeps/x86_64/multiarch/mempcpy_chk.S b/sysdeps/x86_64/multiarch/mempcpy_chk.S
index b7f9e89ea24..17b84701b02 100644
--- a/sysdeps/x86_64/multiarch/mempcpy_chk.S
+++ b/sysdeps/x86_64/multiarch/mempcpy_chk.S
@@ -39,7 +39,7 @@ ENTRY(__mempcpy_chk)
 	testl	$bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
 	jz	2f
 	leaq	__mempcpy_chk_ssse3_back(%rip), %rax
-	testl	$bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
+	testl	$bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip)
 	jz	2f
 	leaq	__mempcpy_chk_avx_unaligned(%rip), %rax
 2:	ret
-- 
2.39.5
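For readers outside the glibc tree, the selection policy the patch
implements can be mirrored with a plain GCC IFUNC.  The sketch below is
illustrative only: my_memcpy, resolve_memcpy and the two stand-in copy
routines are hypothetical names, and glibc's real dispatch goes through
__init_cpu_features (which also verifies OSXSAVE/YMM state) rather than
__builtin_cpu_supports.  What it demonstrates is the point of the
patch: key the choice on AVX2, not plain AVX.  AVX2 first appears on
Haswell, where unaligned 256-bit loads are fast, while Sandy Bridge
(AVX only) splits them internally and takes the fallback path.

#include <stddef.h>
#include <string.h>

typedef void *(*memcpy_fn) (void *, const void *, size_t);

/* Hypothetical stand-ins for __memcpy_avx_unaligned and
   __memcpy_sse2.  */
static void *
copy_avx_unaligned (void *dst, const void *src, size_t n)
{
  return memcpy (dst, src, n);
}

static void *
copy_sse2 (void *dst, const void *src, size_t n)
{
  return memcpy (dst, src, n);
}

/* IFUNC resolver, run once by the dynamic linker: prefer the 256-bit
   unaligned-load routine only when AVX2 is present, exactly as the
   patched memcpy.S does with bit_AVX_Fast_Unaligned_Load.  */
static memcpy_fn
resolve_memcpy (void)
{
  __builtin_cpu_init ();
  if (__builtin_cpu_supports ("avx2"))
    return copy_avx_unaligned;
  return copy_sse2;
}

void *my_memcpy (void *, const void *, size_t)
     __attribute__ ((ifunc ("resolve_memcpy")));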
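One more note, on the compile-time guard added to init-arch.c: the
single |= there can only set bit_AVX2_Usable and
bit_AVX_Fast_Unaligned_Load together because both index_* macros name
the same element of the feature array, and the #error makes that
assumption explicit so a future reshuffle of the FEATURE_INDEX_*
layout cannot silently break the store.  A minimal sketch of the
invariant, with FEATURE_INDEX_1 shown as 0 purely for illustration
(the bit and index constants are the ones the patch defines):

#define FEATURE_INDEX_1			0	/* illustrative value */

/* Constants as defined in the patched init-arch.h.  */
#define bit_AVX2_Usable			(1 << 10)
#define bit_AVX_Fast_Unaligned_Load	(1 << 11)
#define index_AVX2_Usable		FEATURE_INDEX_1
#define index_AVX_Fast_Unaligned_Load	FEATURE_INDEX_1

/* The guard from the patch: both bits must live in the same word.  */
#if index_AVX2_Usable != index_AVX_Fast_Unaligned_Load
# error index_AVX2_Usable != index_AVX_Fast_Unaligned_Load
#endif

static unsigned int feature[FEATURE_INDEX_1 + 1];

void
set_avx2_bits (void)
{
  /* Mirrors the patched store in __init_cpu_features: one
     read-modify-write marks the CPU both as AVX2-capable and as fast
     at unaligned 256-bit AVX loads.  */
  feature[index_AVX2_Usable]
    |= bit_AVX2_Usable | bit_AVX_Fast_Unaligned_Load;
}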