From: H.J. Lu Date: Wed, 24 Jul 2019 21:48:33 +0000 (-0700) Subject: x86-64: Compile branred.c with -mprefer-vector-width=128 [BZ #24603] X-Git-Tag: glibc-2.30~9 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=7e681561a3aea7aa8f21fb031a7c778147dfdf5b;p=thirdparty%2Fglibc.git x86-64: Compile branred.c with -mprefer-vector-width=128 [BZ #24603] When compiled with -O3 and AVX, GCC 8 and 9 optimize some loops in sysdeps/ieee754/dbl-64/branred.c with 256-bit vector instructions, which leads to a store-forwarding stall: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90579 There is no easy fix in the compiler. This patch limits the vector width to 128 bits to work around this issue. It improves the performance of sin and cos by more than 40% on Skylake when compiled with -O3 -march=skylake. Tested with GCC 7/8/9 on x86-64. [BZ #24603] * sysdeps/x86_64/configure.ac: Check if -mprefer-vector-width=128 works. * sysdeps/x86_64/configure: Regenerated. * sysdeps/x86_64/fpu/Makefile (CFLAGS-branred.c): New. Set to -mprefer-vector-width=128 if supported. --- diff --git a/ChangeLog b/ChangeLog index 88108d1e8be..31a6b38bd55 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,12 @@ +2019-07-24 H.J. Lu + + [BZ #24603] + * sysdeps/x86_64/configure.ac: Check if -mprefer-vector-width=128 + works. + * sysdeps/x86_64/configure: Regenerated. + * sysdeps/x86_64/fpu/Makefile (CFLAGS-branred.c): New. Set + to -mprefer-vector-width=128 if supported. + 2019-07-24 Florian Weimer * scripts/build-many-glibcs.py (Context.checkout): Default to diff --git a/sysdeps/x86_64/configure b/sysdeps/x86_64/configure index 8674d145695..84f82c24068 100644 --- a/sysdeps/x86_64/configure +++ b/sysdeps/x86_64/configure @@ -54,6 +54,28 @@ fi config_vars="$config_vars config-cflags-avx512 = $libc_cv_cc_avx512" +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking -mprefer-vector-width=128" >&5 +$as_echo_n "checking -mprefer-vector-width=128... 
" >&6; } +if ${libc_cv_cc_mprefer_vector_width+:} false; then : + $as_echo_n "(cached) " >&6 +else + if { ac_try='${CC-cc} -mprefer-vector-width=128 -xc /dev/null -S -o /dev/null' + { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5 + (eval $ac_try) 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; }; }; then : + libc_cv_cc_mprefer_vector_width=yes +else + libc_cv_cc_mprefer_vector_width=no +fi + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_cc_mprefer_vector_width" >&5 +$as_echo "$libc_cv_cc_mprefer_vector_width" >&6; } +config_vars="$config_vars +config-cflags-mprefer-vector-width = $libc_cv_cc_mprefer_vector_width" + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for Intel MPX support" >&5 $as_echo_n "checking for Intel MPX support... " >&6; } if ${libc_cv_asm_mpx+:} false; then : diff --git a/sysdeps/x86_64/configure.ac b/sysdeps/x86_64/configure.ac index b7d2c0124fb..cdaba0c075a 100644 --- a/sysdeps/x86_64/configure.ac +++ b/sysdeps/x86_64/configure.ac @@ -25,6 +25,15 @@ if test $libc_cv_cc_avx512 = yes; then fi LIBC_CONFIG_VAR([config-cflags-avx512], [$libc_cv_cc_avx512]) +dnl Check if -mprefer-vector-width=128 works. 
+AC_CACHE_CHECK(-mprefer-vector-width=128, libc_cv_cc_mprefer_vector_width, [dnl +LIBC_TRY_CC_OPTION([-mprefer-vector-width=128], + [libc_cv_cc_mprefer_vector_width=yes], + [libc_cv_cc_mprefer_vector_width=no]) +]) +LIBC_CONFIG_VAR([config-cflags-mprefer-vector-width], + [$libc_cv_cc_mprefer_vector_width]) + dnl Check whether asm supports Intel MPX AC_CACHE_CHECK(for Intel MPX support, libc_cv_asm_mpx, [dnl cat > conftest.s <<\EOF diff --git a/sysdeps/x86_64/fpu/Makefile b/sysdeps/x86_64/fpu/Makefile index 2b7d69bb504..74b14ba0966 100644 --- a/sysdeps/x86_64/fpu/Makefile +++ b/sysdeps/x86_64/fpu/Makefile @@ -237,3 +237,15 @@ CFLAGS-test-float-libmvec-sincosf-avx512.c = -DREQUIRE_AVX512F CFLAGS-test-float-libmvec-sincosf-avx512-main.c = $(libmvec-sincos-cflags) $(float-vlen16-arch-ext-cflags) endif endif + +ifeq ($(subdir)$(config-cflags-mprefer-vector-width),mathyes) +# When compiled with -O3 -march=skylake, GCC 8 and 9 optimize some loops +# in branred.c with 256-bit vector instructions, which leads to store +# forward stall: +# +# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90579 +# +# Limit vector width to 128 bits to work around this issue. It improves +# performance of sin and cos by more than 40% on Skylake. +CFLAGS-branred.c = -mprefer-vector-width=128 +endif