git.ipfire.org Git - thirdparty/glibc.git/commitdiff
x86-64: Use _dl_runtime_resolve_opt only with AVX512F [BZ #21871]
author: H.J. Lu <hjl.tools@gmail.com>
Fri, 4 Aug 2017 18:14:19 +0000 (11:14 -0700)
committer: H.J. Lu <hjl.tools@gmail.com>
Fri, 4 Aug 2017 18:14:33 +0000 (11:14 -0700)
On AVX machines with XGETBV (ECX == 1) like Skylake processors,

(gdb) disass _dl_runtime_resolve_avx_opt
Dump of assembler code for function _dl_runtime_resolve_avx_opt:
   0x0000000000015890 <+0>: push   %rax
   0x0000000000015891 <+1>: push   %rcx
   0x0000000000015892 <+2>: push   %rdx
   0x0000000000015893 <+3>: mov    $0x1,%ecx
   0x0000000000015898 <+8>: xgetbv
   0x000000000001589b <+11>: mov    %eax,%r11d
   0x000000000001589e <+14>: pop    %rdx
   0x000000000001589f <+15>: pop    %rcx
   0x00000000000158a0 <+16>: pop    %rax
   0x00000000000158a1 <+17>: and    $0x4,%r11d
   0x00000000000158a5 <+21>: bnd je 0x16200 <_dl_runtime_resolve_sse_vex>
End of assembler dump.

is slower than:

(gdb) disass _dl_runtime_resolve_avx_slow
Dump of assembler code for function _dl_runtime_resolve_avx_slow:
   0x0000000000015850 <+0>: vorpd  %ymm0,%ymm1,%ymm8
   0x0000000000015854 <+4>: vorpd  %ymm2,%ymm3,%ymm9
   0x0000000000015858 <+8>: vorpd  %ymm4,%ymm5,%ymm10
   0x000000000001585c <+12>: vorpd  %ymm6,%ymm7,%ymm11
   0x0000000000015860 <+16>: vorpd  %ymm8,%ymm9,%ymm9
   0x0000000000015865 <+21>: vorpd  %ymm10,%ymm11,%ymm10
   0x000000000001586a <+26>: vpcmpeqd %xmm8,%xmm8,%xmm8
   0x000000000001586f <+31>: vorpd  %ymm9,%ymm10,%ymm10
   0x0000000000015874 <+36>: vptest %ymm10,%ymm8
   0x0000000000015879 <+41>: bnd jae 0x158b0 <_dl_runtime_resolve_avx>
   0x000000000001587c <+44>: vzeroupper
   0x000000000001587f <+47>: bnd jmpq 0x16200 <_dl_runtime_resolve_sse_vex>
End of assembler dump.
(gdb)

since xgetbv takes many more cycles than single-cycle operations like
vorpd/vpcmpeqd/vptest.  _dl_runtime_resolve_opt should be used only with
AVX512, where AVX512 instructions lead to lower CPU frequency on Skylake
server.

[BZ #21871]
* sysdeps/x86/cpu-features.c (init_cpu_features): Set
bit_arch_Use_dl_runtime_resolve_opt only with AVX512F.

ChangeLog
sysdeps/x86/cpu-features.c

index 768334115004f7d37de3ddee8bb885e31fcd2338..613db927a3917ecb71bfd7fb6a11d3ca956aa501 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+2017-08-04  H.J. Lu  <hongjiu.lu@intel.com>
+
+       [BZ #21871]
+       * sysdeps/x86/cpu-features.c (init_cpu_features): Set
+       bit_arch_Use_dl_runtime_resolve_opt only with AVX512F.
+
 2017-08-04  H.J. Lu  <hongjiu.lu@intel.com>
 
        [BZ #21790]
index 1d087ea732bb921b0abb2af462441d6853d9306f..6f900840d45c5cb384e4e5f738fd4f07237c6b64 100644 (file)
@@ -244,10 +244,13 @@ init_cpu_features (struct cpu_features *cpu_features)
          |= bit_arch_Prefer_No_AVX512;
 
       /* To avoid SSE transition penalty, use _dl_runtime_resolve_slow.
-         If XGETBV suports ECX == 1, use _dl_runtime_resolve_opt.  */
+         If XGETBV suports ECX == 1, use _dl_runtime_resolve_opt.
+        Use _dl_runtime_resolve_opt only with AVX512F since it is
+        slower than _dl_runtime_resolve_slow with AVX.  */
       cpu_features->feature[index_arch_Use_dl_runtime_resolve_slow]
        |= bit_arch_Use_dl_runtime_resolve_slow;
-      if (cpu_features->max_cpuid >= 0xd)
+      if (CPU_FEATURES_ARCH_P (cpu_features, AVX512F_Usable)
+         && cpu_features->max_cpuid >= 0xd)
        {
          unsigned int eax;