]> git.ipfire.org Git - thirdparty/glibc.git/commitdiff
x86-64: Improve branch prediction in _dl_runtime_resolve_avx512_opt [BZ #21258]
author: H.J. Lu <hjl.tools@gmail.com>
Tue, 21 Mar 2017 17:59:31 +0000 (10:59 -0700)
committer: H.J. Lu <hjl.tools@gmail.com>
Fri, 7 Apr 2017 17:06:58 +0000 (10:06 -0700)
On Skylake server, _dl_runtime_resolve_avx512_opt is used to preserve
the first 8 vector registers.  The code layout is

  if only %xmm0 - %xmm7 registers are used
     preserve %xmm0 - %xmm7 registers
  if only %ymm0 - %ymm7 registers are used
     preserve %ymm0 - %ymm7 registers
  preserve %zmm0 - %zmm7 registers

Branch prediction always executes the fallthrough code path to preserve
%zmm0 - %zmm7 registers speculatively, even when only %xmm0 - %xmm7
registers are used.  This leads to lower CPU frequency on Skylake
server.  This patch changes the fallthrough code path to preserve
%xmm0 - %xmm7 registers instead:

  if the whole %zmm0 - %zmm7 registers are used
     preserve %zmm0 - %zmm7 registers
  if only %ymm0 - %ymm7 registers are used
     preserve %ymm0 - %ymm7 registers
  preserve %xmm0 - %xmm7 registers

Tested on Skylake server.

[BZ #21258]
* sysdeps/x86_64/dl-trampoline.S (_dl_runtime_resolve_opt):
Define only if _dl_runtime_resolve is defined to
_dl_runtime_resolve_sse_vex.
* sysdeps/x86_64/dl-trampoline.h (_dl_runtime_resolve_opt):
Fallthrough to _dl_runtime_resolve_sse_vex.

(cherry picked from commit c15f8eb50cea7ad1a4ccece6e0982bf426d52c00)

ChangeLog
sysdeps/x86_64/dl-trampoline.S
sysdeps/x86_64/dl-trampoline.h

index 9047f652fe16205de8905b9d978ff70277e7dd22..6e4696c7dfc681967024af92de13df1d02dc408d 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,12 @@
+2017-04-07  H.J. Lu  <hongjiu.lu@intel.com>
+
+       [BZ #21258]
+       * sysdeps/x86_64/dl-trampoline.S (_dl_runtime_resolve_opt):
+       Define only if _dl_runtime_resolve is defined to
+       _dl_runtime_resolve_sse_vex.
+       * sysdeps/x86_64/dl-trampoline.h (_dl_runtime_resolve_opt):
+       Fallthrough to _dl_runtime_resolve_sse_vex.
+
 2017-04-03  Mike Frysinger  <vapier@gentoo.org>
 
        [BZ #21253]
index 39f595e1e1855cfffe7f5b3c643cc893d5415684..50b23633e3bd88d4faf47321c91b38f2d8ae104b 100644 (file)
 #endif
 #define VEC(i)                 zmm##i
 #define _dl_runtime_resolve    _dl_runtime_resolve_avx512
-#define _dl_runtime_resolve_opt        _dl_runtime_resolve_avx512_opt
 #define _dl_runtime_profile    _dl_runtime_profile_avx512
 #include "dl-trampoline.h"
 #undef _dl_runtime_resolve
-#undef _dl_runtime_resolve_opt
 #undef _dl_runtime_profile
 #undef VEC
 #undef VMOV
 # define VMOV                  vmovdqu
 #endif
 #define _dl_runtime_resolve    _dl_runtime_resolve_sse_vex
+#define _dl_runtime_resolve_opt        _dl_runtime_resolve_avx512_opt
 #include "dl-trampoline.h"
index abe4471c1de87a953cb49c5a272d3041c39b5b47..32ad3af2027073ed8699b6a91530b2060101da17 100644 (file)
@@ -129,19 +129,20 @@ _dl_runtime_resolve_opt:
        # YMM state isn't in use.
        PRESERVE_BND_REGS_PREFIX
        jz _dl_runtime_resolve_sse_vex
-# elif VEC_SIZE == 64
+# elif VEC_SIZE == 16
        # For ZMM registers, check if YMM state and ZMM state are in
        # use.
        andl $(bit_YMM_state | bit_ZMM0_15_state), %r11d
        cmpl $bit_YMM_state, %r11d
-       # Preserve %xmm0 - %xmm7 registers with the zero upper 384 bits if
-       # neither YMM state nor ZMM state are in use.
+       # Preserve %zmm0 - %zmm7 registers if ZMM state is in use.
        PRESERVE_BND_REGS_PREFIX
-       jl _dl_runtime_resolve_sse_vex
+       jg _dl_runtime_resolve_avx512
        # Preserve %ymm0 - %ymm7 registers with the zero upper 256 bits if
        # ZMM state isn't in use.
        PRESERVE_BND_REGS_PREFIX
        je _dl_runtime_resolve_avx
+       # Preserve %xmm0 - %xmm7 registers with the zero upper 384 bits if
+       # neither YMM state nor ZMM state are in use.
 # else
 #  error Unsupported VEC_SIZE!
 # endif