src/patches/glibc/glibc-rh1195453-avx512.patch
1#
2# Based on AVX-512 support for glibc, but heavily modified for rhel-6.7.
3# Without assembler support we drop all of the configure checks and simply
4# emit, as .byte directives, the minimal AVX-512 instructions required
5# by the loader. Likewise, testing natively is impossible, so instead we use
6# the Intel emulator running in `-skx` (Skylake Xeon) emulation mode and
7# verify that a pre-built set of tests passes.
8#
9# commit 6986b98a18490e76b16911d1c6b1ba013598d40d
10# Author: Ulrich Drepper <drepper@gmail.com>
11# Date: Wed Jul 20 14:20:00 2011 -0400
12#
13# Force La_x86_64_ymm to be 16-byte aligned
14#
15# commit aa4de9cea5c07d43caeaca9722c2d417e9a2919c
16# Author: H.J. Lu <hjl.tools@gmail.com>
17# Date: Fri Mar 14 08:51:25 2014 -0700
18#
19# Check AVX-512 assembler support first
20#
21# It checks AVX-512 assembler support first and sets libc_cv_cc_avx512 to
22# $libc_cv_asm_avx512, instead of yes. GCC won't support AVX-512 if
23# the assembler doesn't support it.
24#
25# * sysdeps/x86_64/configure.ac: Check AVX-512 assembler support
26# first. Disable AVX-512 GCC support if assembler doesn't support
27# it.
28# * sysdeps/x86_64/configure: Regenerated.
29#
30# commit 2d63a517e4084ec80403cd9f278690fa8b676cc4
31# Author: Igor Zamyatin <igor.zamyatin@intel.com>
32# Date: Thu Mar 13 11:10:22 2014 -0700
33#
34# Save and restore AVX-512 zmm registers to x86-64 ld.so
35#
36# AVX-512 ISA adds 512-bit zmm registers. This patch updates
37# _dl_runtime_profile to pass zmm registers to run-time audit. It also
38# changes _dl_x86_64_save_sse and _dl_x86_64_restore_sse to support zmm
39# registers, which are called only when RTLD_PREPARE_FOREIGN_CALL
40# is used. Its performance impact is minimal.
41#
42# * config.h.in (HAVE_AVX512_SUPPORT): New #undef.
43# (HAVE_AVX512_ASM_SUPPORT): Likewise.
44# * sysdeps/x86_64/bits/link.h (La_x86_64_zmm): New.
45# (La_x86_64_vector): Add zmm.
46# * sysdeps/x86_64/Makefile (tests): Add tst-audit10.
47# (modules-names): Add tst-auditmod10a and tst-auditmod10b.
48# ($(objpfx)tst-audit10): New target.
49# ($(objpfx)tst-audit10.out): Likewise.
50# (tst-audit10-ENV): New.
51# (AVX512-CFLAGS): Likewise.
52# (CFLAGS-tst-audit10.c): Likewise.
53# (CFLAGS-tst-auditmod10a.c): Likewise.
54# (CFLAGS-tst-auditmod10b.c): Likewise.
55# * sysdeps/x86_64/configure.ac: Set config-cflags-avx512,
56# HAVE_AVX512_SUPPORT and HAVE_AVX512_ASM_SUPPORT.
57# * sysdeps/x86_64/configure: Regenerated.
58# * sysdeps/x86_64/dl-trampoline.S (_dl_runtime_profile): Add
59# AVX-512 zmm register support.
60# (_dl_x86_64_save_sse): Likewise.
61# (_dl_x86_64_restore_sse): Likewise.
62# * sysdeps/x86_64/dl-trampoline.h: Updated to support different
63# size vector registers.
64# * sysdeps/x86_64/link-defines.sym (YMM_SIZE): New.
65# (ZMM_SIZE): Likewise.
66# * sysdeps/x86_64/tst-audit10.c: New file.
67# * sysdeps/x86_64/tst-auditmod10a.c: Likewise.
68# * sysdeps/x86_64/tst-auditmod10b.c: Likewise.
69#
70# In addition, this applies
71# https://sourceware.org/ml/libc-alpha/2014-09/msg00228.html
72# to extend the zmm register checking.
73#
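# Note on the hand-encoded instructions in the hunks below: the comments
# show offsets such as 0xc0(%rsp) (LR_VECTOR_OFFSET) while the encoded
# displacement byte is only 0x03, because EVEX uses "disp8*N" compression
# and N is 64 for these full 512-bit memory operands.  A minimal C sketch
# of that relationship (the helper name is ours, purely illustrative):
#
#   #include <assert.h>
#
#   /* EVEX disp8*N: for a 64-byte zmm memory operand the stored 8-bit
#      displacement is the real byte offset divided by 64.  */
#   static int evex_zmm_disp8_to_offset (int disp8) { return disp8 * 64; }
#
#   int
#   main (void)
#   {
#     assert (evex_zmm_disp8_to_offset (0x03) == 0xc0);   /* %zmm0 slot */
#     assert (evex_zmm_disp8_to_offset (0x0a) == 0x280);  /* %zmm7 slot */
#     return 0;
#   }
#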
74diff -urN glibc-2.12-2-gc4ccff1/sysdeps/x86_64/bits/link.h glibc-2.12-2-gc4ccff1.mod/sysdeps/x86_64/bits/link.h
75--- glibc-2.12-2-gc4ccff1/sysdeps/x86_64/bits/link.h 2010-05-04 07:27:23.000000000 -0400
76+++ glibc-2.12-2-gc4ccff1.mod/sysdeps/x86_64/bits/link.h 2015-03-03 23:03:25.041829238 -0500
77@@ -65,7 +65,10 @@
78 /* Registers for entry into PLT on x86-64. */
79 # if __GNUC_PREREQ (4,0)
80 typedef float La_x86_64_xmm __attribute__ ((__vector_size__ (16)));
81-typedef float La_x86_64_ymm __attribute__ ((__vector_size__ (32)));
82+typedef float La_x86_64_ymm __attribute__ ((__vector_size__ (32),
83+ __aligned__ (16)));
84+typedef double La_x86_64_zmm __attribute__ ((__vector_size__ (64),
85+ __aligned__ (16)));
86 # else
87 typedef float La_x86_64_xmm __attribute__ ((__mode__ (__V4SF__)));
88 # endif
89@@ -74,9 +77,10 @@
90 {
91 # if __GNUC_PREREQ (4,0)
92 La_x86_64_ymm ymm[2];
93+ La_x86_64_zmm zmm[1];
94 # endif
95 La_x86_64_xmm xmm[4];
96-} La_x86_64_vector __attribute__ ((aligned(16)));
97+} La_x86_64_vector __attribute__ ((__aligned__(16)));
98
99 typedef struct La_x86_64_regs
100 {
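#
# The La_x86_64_zmm and La_x86_64_vector changes above are what an
# LD_AUDIT module sees in La_x86_64_regs when the loader forwards vector
# arguments.  A minimal sketch of such a module, loosely modeled on the
# upstream tst-auditmod10a/b tests (illustrative only; the real tests
# also verify the register contents):
#
#   #define _GNU_SOURCE
#   #include <link.h>
#   #include <stdio.h>
#
#   unsigned int
#   la_version (unsigned int version)
#   {
#     return version;
#   }
#
#   unsigned int
#   la_objopen (struct link_map *map, Lmid_t lmid, uintptr_t *cookie)
#   {
#     /* Audit every binding so la_x86_64_gnu_pltenter gets called.  */
#     return LA_FLG_BINDTO | LA_FLG_BINDFROM;
#   }
#
#   Elf64_Addr
#   la_x86_64_gnu_pltenter (Elf64_Sym *sym, unsigned int ndx,
#                           uintptr_t *refcook, uintptr_t *defcook,
#                           La_x86_64_regs *regs, unsigned int *flags,
#                           const char *symname, long int *framesizep)
#   {
#     /* With this patch lr_vector[0] carries the full first vector
#        argument: xmm, ymm or zmm wide, depending on the CPU.  */
#     const double *v0 = (const double *) &regs->lr_vector[0];
#     printf ("pltenter %s: first vector arg starts with %f\n", symname, v0[0]);
#     return sym->st_value;
#   }
#
# Built as a shared object (gcc -fPIC -shared) and enabled with
# LD_AUDIT=./module.so, which is how the new tst-audit10 test loads
# tst-auditmod10b.
#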
101diff -urN glibc-2.12-2-gc4ccff1/sysdeps/x86_64/dl-trampoline.h glibc-2.12-2-gc4ccff1.mod/sysdeps/x86_64/dl-trampoline.h
102--- glibc-2.12-2-gc4ccff1/sysdeps/x86_64/dl-trampoline.h 2015-03-03 23:03:05.109457627 -0500
103+++ glibc-2.12-2-gc4ccff1.mod/sysdeps/x86_64/dl-trampoline.h 2015-03-03 23:06:58.434101818 -0500
104@@ -20,14 +20,26 @@
105
106 #ifdef RESTORE_AVX
107 /* This is to support AVX audit modules. */
108- vmovdqu %ymm0, (LR_VECTOR_OFFSET)(%rsp)
109- vmovdqu %ymm1, (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp)
110- vmovdqu %ymm2, (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp)
111- vmovdqu %ymm3, (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp)
112- vmovdqu %ymm4, (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp)
113- vmovdqu %ymm5, (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp)
114- vmovdqu %ymm6, (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp)
115- vmovdqu %ymm7, (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp)
116+# if HAVE_NO_AVX512_ASM_SUPPORT
117+ /* Save the AVX-512 registers. Use .byte because we lack assembler support. */
118+ .byte 0x62,0xf1,0xfe,0x48,0x7f,0x44,0x24,0x03 # vmovdqu64 %zmm0,0xc0(%rsp)
119+ .byte 0x62,0xf1,0xfe,0x48,0x7f,0x4c,0x24,0x04 # vmovdqu64 %zmm1,0x100(%rsp)
120+ .byte 0x62,0xf1,0xfe,0x48,0x7f,0x54,0x24,0x05 # vmovdqu64 %zmm2,0x140(%rsp)
121+ .byte 0x62,0xf1,0xfe,0x48,0x7f,0x5c,0x24,0x06 # vmovdqu64 %zmm3,0x180(%rsp)
122+ .byte 0x62,0xf1,0xfe,0x48,0x7f,0x64,0x24,0x07 # vmovdqu64 %zmm4,0x1c0(%rsp)
123+ .byte 0x62,0xf1,0xfe,0x48,0x7f,0x6c,0x24,0x08 # vmovdqu64 %zmm5,0x200(%rsp)
124+ .byte 0x62,0xf1,0xfe,0x48,0x7f,0x74,0x24,0x09 # vmovdqu64 %zmm6,0x240(%rsp)
125+ .byte 0x62,0xf1,0xfe,0x48,0x7f,0x7c,0x24,0x0a # vmovdqu64 %zmm7,0x280(%rsp)
126+# else
127+ VMOV %VEC(0), (LR_VECTOR_OFFSET)(%rsp)
128+ VMOV %VEC(1), (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp)
129+ VMOV %VEC(2), (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp)
130+ VMOV %VEC(3), (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp)
131+ VMOV %VEC(4), (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp)
132+ VMOV %VEC(5), (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp)
133+ VMOV %VEC(6), (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp)
134+ VMOV %VEC(7), (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp)
135+# endif
136
137 /* Save xmm0-xmm7 registers to detect if any of them are
138 changed by audit module. */
139@@ -73,7 +85,11 @@
140 je 2f
141 vmovdqa %xmm0, (LR_VECTOR_OFFSET)(%rsp)
142 jmp 1f
143-2: vmovdqu (LR_VECTOR_OFFSET)(%rsp), %ymm0
144+# if HAVE_NO_AVX512_ASM_SUPPORT
145+2: .byte 0x62,0xf1,0xfe,0x48,0x6f,0x44,0x24,0x03 # vmovdqu64 0xc0(%rsp),%zmm0
146+# else
147+2: VMOV (LR_VECTOR_OFFSET)(%rsp), %VEC(0)
148+# endif
149 vmovdqa %xmm0, (LR_XMM_OFFSET)(%rsp)
150
151 1: vpcmpeqq (LR_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm8
152@@ -82,7 +98,11 @@
153 je 2f
154 vmovdqa %xmm1, (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp)
155 jmp 1f
156-2: vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp), %ymm1
157+# if HAVE_NO_AVX512_ASM_SUPPORT
158+2: .byte 0x62,0xf1,0xfe,0x48,0x6f,0x4c,0x24,0x04 # vmovdqu64 0x100(%rsp),%zmm1
159+# else
160+2: VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp), %VEC(1)
161+# endif
162 vmovdqa %xmm1, (LR_XMM_OFFSET + XMM_SIZE)(%rsp)
163
164 1: vpcmpeqq (LR_SIZE + XMM_SIZE*2)(%rsp), %xmm2, %xmm8
165@@ -91,7 +111,11 @@
166 je 2f
167 vmovdqa %xmm2, (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp)
168 jmp 1f
169-2: vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp), %ymm2
170+# if HAVE_NO_AVX512_ASM_SUPPORT
171+2: .byte 0x62,0xf1,0xfe,0x48,0x6f,0x54,0x24,0x05 # vmovdqu64 0x140(%rsp),%zmm2
172+# else
173+2: VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp), %VEC(2)
174+# endif
175 vmovdqa %xmm2, (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp)
176
177 1: vpcmpeqq (LR_SIZE + XMM_SIZE*3)(%rsp), %xmm3, %xmm8
178@@ -100,7 +124,11 @@
179 je 2f
180 vmovdqa %xmm3, (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp)
181 jmp 1f
182-2: vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp), %ymm3
183+# if HAVE_NO_AVX512_ASM_SUPPORT
184+2: .byte 0x62,0xf1,0xfe,0x48,0x6f,0x5c,0x24,0x06 # vmovdqu64 0x180(%rsp),%zmm3
185+# else
186+2: VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp), %VEC(3)
187+# endif
188 vmovdqa %xmm3, (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp)
189
190 1: vpcmpeqq (LR_SIZE + XMM_SIZE*4)(%rsp), %xmm4, %xmm8
191@@ -109,7 +137,11 @@
192 je 2f
193 vmovdqa %xmm4, (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp)
194 jmp 1f
195-2: vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp), %ymm4
196+# if HAVE_NO_AVX512_ASM_SUPPORT
197+2: .byte 0x62,0xf1,0xfe,0x48,0x6f,0x64,0x24,0x07 # vmovdqu64 0x1c0(%rsp),%zmm4
198+# else
199+2: VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp), %VEC(4)
200+# endif
201 vmovdqa %xmm4, (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp)
202
203 1: vpcmpeqq (LR_SIZE + XMM_SIZE*5)(%rsp), %xmm5, %xmm8
204@@ -118,7 +150,11 @@
205 je 2f
206 vmovdqa %xmm5, (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp)
207 jmp 1f
208-2: vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp), %ymm5
209+# if HAVE_NO_AVX512_ASM_SUPPORT
210+2: .byte 0x62,0xf1,0xfe,0x48,0x6f,0x6c,0x24,0x08 # vmovdqu64 0x200(%rsp),%zmm5
211+# else
212+2: VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp), %VEC(5)
213+# endif
214 vmovdqa %xmm5, (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp)
215
216 1: vpcmpeqq (LR_SIZE + XMM_SIZE*6)(%rsp), %xmm6, %xmm8
217@@ -127,7 +163,11 @@
218 je 2f
219 vmovdqa %xmm6, (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp)
220 jmp 1f
221-2: vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp), %ymm6
222+# if HAVE_NO_AVX512_ASM_SUPPORT
223+2: .byte 0x62,0xf1,0xfe,0x48,0x6f,0x74,0x24,0x09 # vmovdqu64 0x240(%rsp),%zmm6
224+# else
225+2: VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp), %VEC(6)
226+# endif
227 vmovdqa %xmm6, (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp)
228
229 1: vpcmpeqq (LR_SIZE + XMM_SIZE*7)(%rsp), %xmm7, %xmm8
230@@ -136,7 +176,11 @@
231 je 2f
232 vmovdqa %xmm7, (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp)
233 jmp 1f
234-2: vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp), %ymm7
235+# if HAVE_NO_AVX512_ASM_SUPPORT
236+2: .byte 0x62,0xf1,0xfe,0x48,0x6f,0x7c,0x24,0x0a # vmovdqu64 0x280(%rsp),%zmm7
237+# else
238+2: VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp), %VEC(7)
239+# endif
240 vmovdqa %xmm7, (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp)
241
242 1:
243@@ -214,8 +258,13 @@
244
245 #ifdef RESTORE_AVX
246 /* This is to support AVX audit modules. */
247- vmovdqu %ymm0, LRV_VECTOR0_OFFSET(%rcx)
248- vmovdqu %ymm1, LRV_VECTOR1_OFFSET(%rcx)
249+# if HAVE_NO_AVX512_ASM_SUPPORT
250+ .byte 0x62,0xf1,0xfe,0x48,0x7f,0x81,0x50,0x00,0x00,0x00 # vmovdqu64 %zmm0,0x50(%rcx)
251+ .byte 0x62,0xf1,0xfe,0x48,0x7f,0x89,0x90,0x00,0x00,0x00 # vmovdqu64 %zmm1,0x90(%rcx)
252+# else
253+ VMOV %VEC(0), LRV_VECTOR0_OFFSET(%rcx)
254+ VMOV %VEC(1), LRV_VECTOR1_OFFSET(%rcx)
255+# endif
256
257 /* Save xmm0/xmm1 registers to detect if they are changed
258 by audit module. */
259@@ -244,13 +293,21 @@
260 vpmovmskb %xmm2, %esi
261 cmpl $0xffff, %esi
262 jne 1f
263- vmovdqu LRV_VECTOR0_OFFSET(%rsp), %ymm0
264+# if HAVE_NO_AVX512_ASM_SUPPORT
265+ .byte 0x62,0xf1,0xfe,0x48,0x6f,0x84,0x24,0x50,0x00,0x00,0x00 # vmovdqu64 0x50(%rsp),%zmm0
266+# else
267+ VMOV LRV_VECTOR0_OFFSET(%rsp), %VEC(0)
268+# endif
269
270 1: vpcmpeqq (LRV_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm2
271 vpmovmskb %xmm2, %esi
272 cmpl $0xffff, %esi
273 jne 1f
274- vmovdqu LRV_VECTOR1_OFFSET(%rsp), %ymm1
275+# if HAVE_NO_AVX512_ASM_SUPPORT
276+ .byte 0x62,0xf1,0xfe,0x48,0x6f,0x8c,0x24,0x90,0x00,0x00,0x00 # vmovdqu64 0x90(%rsp),%zmm1
277+# else
278+ VMOV LRV_VECTOR1_OFFSET(%rsp), %VEC(1)
279+# endif
280
281 1:
282 #endif
283diff -urN glibc-2.12-2-gc4ccff1/sysdeps/x86_64/dl-trampoline.S glibc-2.12-2-gc4ccff1.mod/sysdeps/x86_64/dl-trampoline.S
284--- glibc-2.12-2-gc4ccff1/sysdeps/x86_64/dl-trampoline.S 2015-03-03 23:03:05.108457659 -0500
285+++ glibc-2.12-2-gc4ccff1.mod/sysdeps/x86_64/dl-trampoline.S 2015-03-03 23:07:31.799049953 -0500
286@@ -134,7 +134,7 @@
287 .previous
288
289 cmpl $0, L(have_avx)(%rip)
290- jne 1f
291+ jne L(defined)
292 movq %rbx, %r11 # Save rbx
293 movl $1, %eax
294 cpuid
295@@ -143,18 +143,51 @@
296 // AVX and XSAVE supported?
297 andl $((1 << 28) | (1 << 27)), %ecx
298 cmpl $((1 << 28) | (1 << 27)), %ecx
299- jne 2f
300+ jne 10f
301+ // AVX512 supported in processor?
302+ movq %rbx, %r11 # Save rbx
303+ xorl %ecx, %ecx
304+ mov $0x7, %eax
305+ cpuid
306+ andl $(1 << 16), %ebx
307 xorl %ecx, %ecx
308 // Get XFEATURE_ENABLED_MASK
309 xgetbv
310- andl $0x6, %eax
311-2: subl $0x5, %eax
312+ test %ebx, %ebx
313+ movq %r11, %rbx # Restore rbx
314+ je 20f
315+ // Verify that XCR0[7:5] = '111b' and
316+ // XCR0[2:1] = '11b' which means
317+ // that zmm state is enabled
318+ andl $0xe6, %eax
319+ cmpl $0xe6, %eax
320+ jne 20f
321+ movl %eax, L(have_avx)(%rip)
322+L(avx512):
323+# define RESTORE_AVX
324+# define HAVE_NO_AVX512_ASM_SUPPORT 1
325+# define VMOV vmovdqu64
326+# define VEC(i) zmm##i
327+# define MORE_CODE
328+# include "dl-trampoline.h"
329+# undef VMOV
330+# undef VEC
331+# undef RESTORE_AVX
332+# undef HAVE_NO_AVX512_ASM_SUPPORT
333+20: andl $0x6, %eax
334+10: subl $0x5, %eax
335 movl %eax, L(have_avx)(%rip)
336 cmpl $0, %eax
337
338-1: js L(no_avx)
339+L(defined):
340+ js L(no_avx)
341+ cmpl $0xe6, L(have_avx)(%rip)
342+ je L(avx512)
343+
344
345 # define RESTORE_AVX
346+# define VMOV vmovdqu
347+# define VEC(i) ymm##i
348 # define MORE_CODE
349 # include "dl-trampoline.h"
350
351@@ -178,7 +211,7 @@
352 _dl_x86_64_save_sse:
353 # ifdef HAVE_AVX_SUPPORT
354 cmpl $0, L(have_avx)(%rip)
355- jne 1f
356+ jne L(defined_5)
357 movq %rbx, %r11 # Save rbx
358 movl $1, %eax
359 cpuid
360@@ -187,21 +220,37 @@
361 // AVX and XSAVE supported?
362 andl $((1 << 28) | (1 << 27)), %ecx
363 cmpl $((1 << 28) | (1 << 27)), %ecx
364- jne 2f
365+ jne 1f
366+ // AVX512 supported in a processor?
367+ movq %rbx, %r11 # Save rbx
368+ xorl %ecx,%ecx
369+ mov $0x7,%eax
370+ cpuid
371+ andl $(1 << 16), %ebx
372 xorl %ecx, %ecx
373 // Get XFEATURE_ENABLED_MASK
374 xgetbv
375- andl $0x6, %eax
376- cmpl $0x6, %eax
377- // Nonzero if SSE and AVX state saving is enabled.
378- sete %al
379-2: leal -1(%eax,%eax), %eax
380+ test %ebx, %ebx
381+ movq %r11, %rbx # Restore rbx
382+ je 2f
383+ // Verify that XCR0[7:5] = '111b' and
384+ // XCR0[2:1] = '11b' which means
385+ // that zmm state is enabled
386+ andl $0xe6, %eax
387 movl %eax, L(have_avx)(%rip)
388- cmpl $0, %eax
389+ cmpl $0xe6, %eax
390+ je L(avx512_5)
391
392-1: js L(no_avx5)
393+2: andl $0x6, %eax
394+1: subl $0x5, %eax
395+ movl %eax, L(have_avx)(%rip)
396+ cmpl $0, %eax
397
398-# define YMM_SIZE 32
399+L(defined_5):
400+ js L(no_avx5)
401+ cmpl $0xe6, L(have_avx)(%rip)
402+ je L(avx512_5)
403+
404 vmovdqa %ymm0, %fs:RTLD_SAVESPACE_SSE+0*YMM_SIZE
405 vmovdqa %ymm1, %fs:RTLD_SAVESPACE_SSE+1*YMM_SIZE
406 vmovdqa %ymm2, %fs:RTLD_SAVESPACE_SSE+2*YMM_SIZE
407@@ -211,6 +260,26 @@
408 vmovdqa %ymm6, %fs:RTLD_SAVESPACE_SSE+6*YMM_SIZE
409 vmovdqa %ymm7, %fs:RTLD_SAVESPACE_SSE+7*YMM_SIZE
410 ret
411+L(avx512_5):
412+# Original instructions:
413+# vmovdqu64 %zmm0, %fs:RTLD_SAVESPACE_SSE+0*ZMM_SIZE
414+# vmovdqu64 %zmm1, %fs:RTLD_SAVESPACE_SSE+1*ZMM_SIZE
415+# vmovdqu64 %zmm2, %fs:RTLD_SAVESPACE_SSE+2*ZMM_SIZE
416+# vmovdqu64 %zmm3, %fs:RTLD_SAVESPACE_SSE+3*ZMM_SIZE
417+# vmovdqu64 %zmm4, %fs:RTLD_SAVESPACE_SSE+4*ZMM_SIZE
418+# vmovdqu64 %zmm5, %fs:RTLD_SAVESPACE_SSE+5*ZMM_SIZE
419+# vmovdqu64 %zmm6, %fs:RTLD_SAVESPACE_SSE+6*ZMM_SIZE
420+# vmovdqu64 %zmm7, %fs:RTLD_SAVESPACE_SSE+7*ZMM_SIZE
421+# Assembled instructions:
422+ .byte 0x64,0x62,0xf1,0xfe,0x48,0x7f,0x04,0x25,0x80,0x00,0x00,0x00 # vmovdqu64 %zmm0,%fs:0x80
423+ .byte 0x64,0x62,0xf1,0xfe,0x48,0x7f,0x0c,0x25,0xc0,0x00,0x00,0x00 # vmovdqu64 %zmm1,%fs:0xc0
424+ .byte 0x64,0x62,0xf1,0xfe,0x48,0x7f,0x14,0x25,0x00,0x01,0x00,0x00 # vmovdqu64 %zmm2,%fs:0x100
425+ .byte 0x64,0x62,0xf1,0xfe,0x48,0x7f,0x1c,0x25,0x40,0x01,0x00,0x00 # vmovdqu64 %zmm3,%fs:0x140
426+ .byte 0x64,0x62,0xf1,0xfe,0x48,0x7f,0x24,0x25,0x80,0x01,0x00,0x00 # vmovdqu64 %zmm4,%fs:0x180
427+ .byte 0x64,0x62,0xf1,0xfe,0x48,0x7f,0x2c,0x25,0xc0,0x01,0x00,0x00 # vmovdqu64 %zmm5,%fs:0x1c0
428+ .byte 0x64,0x62,0xf1,0xfe,0x48,0x7f,0x34,0x25,0x00,0x02,0x00,0x00 # vmovdqu64 %zmm6,%fs:0x200
429+ .byte 0x64,0x62,0xf1,0xfe,0x48,0x7f,0x3c,0x25,0x40,0x02,0x00,0x00 # vmovdqu64 %zmm7,%fs:0x240
430+ ret
431 L(no_avx5):
432 # endif
433 movdqa %xmm0, %fs:RTLD_SAVESPACE_SSE+0*XMM_SIZE
434@@ -234,6 +303,8 @@
435 # ifdef HAVE_AVX_SUPPORT
436 cmpl $0, L(have_avx)(%rip)
437 js L(no_avx6)
438+ cmpl $0xe6, L(have_avx)(%rip)
439+ je L(avx512_6)
440
441 vmovdqa %fs:RTLD_SAVESPACE_SSE+0*YMM_SIZE, %ymm0
442 vmovdqa %fs:RTLD_SAVESPACE_SSE+1*YMM_SIZE, %ymm1
443@@ -244,6 +315,26 @@
444 vmovdqa %fs:RTLD_SAVESPACE_SSE+6*YMM_SIZE, %ymm6
445 vmovdqa %fs:RTLD_SAVESPACE_SSE+7*YMM_SIZE, %ymm7
446 ret
447+L(avx512_6):
448+# Original instructions:
449+# vmovdqu64 %fs:RTLD_SAVESPACE_SSE+0*ZMM_SIZE, %zmm0
450+# vmovdqu64 %fs:RTLD_SAVESPACE_SSE+1*ZMM_SIZE, %zmm1
451+# vmovdqu64 %fs:RTLD_SAVESPACE_SSE+2*ZMM_SIZE, %zmm2
452+# vmovdqu64 %fs:RTLD_SAVESPACE_SSE+3*ZMM_SIZE, %zmm3
453+# vmovdqu64 %fs:RTLD_SAVESPACE_SSE+4*ZMM_SIZE, %zmm4
454+# vmovdqu64 %fs:RTLD_SAVESPACE_SSE+5*ZMM_SIZE, %zmm5
455+# vmovdqu64 %fs:RTLD_SAVESPACE_SSE+6*ZMM_SIZE, %zmm6
456+# vmovdqu64 %fs:RTLD_SAVESPACE_SSE+7*ZMM_SIZE, %zmm7
457+# Assembled instructions:
458+ .byte 0x64,0x62,0xf1,0xfe,0x48,0x6f,0x04,0x25,0x80,0x00,0x00,0x00 # vmovdqu64 %fs:0x80,%zmm0
459+ .byte 0x64,0x62,0xf1,0xfe,0x48,0x6f,0x0c,0x25,0xc0,0x00,0x00,0x00 # vmovdqu64 %fs:0xc0,%zmm1
460+ .byte 0x64,0x62,0xf1,0xfe,0x48,0x6f,0x14,0x25,0x00,0x01,0x00,0x00 # vmovdqu64 %fs:0x100,%zmm2
461+ .byte 0x64,0x62,0xf1,0xfe,0x48,0x6f,0x1c,0x25,0x40,0x01,0x00,0x00 # vmovdqu64 %fs:0x140,%zmm3
462+ .byte 0x64,0x62,0xf1,0xfe,0x48,0x6f,0x24,0x25,0x80,0x01,0x00,0x00 # vmovdqu64 %fs:0x180,%zmm4
463+ .byte 0x64,0x62,0xf1,0xfe,0x48,0x6f,0x2c,0x25,0xc0,0x01,0x00,0x00 # vmovdqu64 %fs:0x1c0,%zmm5
464+ .byte 0x64,0x62,0xf1,0xfe,0x48,0x6f,0x34,0x25,0x00,0x02,0x00,0x00 # vmovdqu64 %fs:0x200,%zmm6
465+ .byte 0x64,0x62,0xf1,0xfe,0x48,0x6f,0x3c,0x25,0x40,0x02,0x00,0x00 # vmovdqu64 %fs:0x240,%zmm7
466+ ret
467 L(no_avx6):
468 # endif
469 movdqa %fs:RTLD_SAVESPACE_SSE+0*XMM_SIZE, %xmm0
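#
# The cpuid/xgetbv sequences added above gate the zmm paths on AVX and
# OSXSAVE (CPUID.1:ECX bits 28/27), on AVX512F (CPUID.(EAX=7,ECX=0):EBX
# bit 16), and on XCR0 having bits 7:5 and 2:1 all set (mask 0xe6), i.e.
# the OS saving and restoring zmm state.  For reference, the same check
# as a stand-alone C sketch (illustrative only, not part of the patch;
# the xgetbv0 helper is ours, and the max-leaf check is omitted just as
# in the assembly):
#
#   #include <cpuid.h>
#   #include <stdio.h>
#
#   static unsigned long long
#   xgetbv0 (void)
#   {
#     unsigned int eax, edx;
#     __asm__ volatile ("xgetbv" : "=a" (eax), "=d" (edx) : "c" (0));
#     return ((unsigned long long) edx << 32) | eax;
#   }
#
#   int
#   main (void)
#   {
#     unsigned int eax, ebx, ecx, edx;
#     if (!__get_cpuid (1, &eax, &ebx, &ecx, &edx))
#       return 1;
#     int avx = (ecx & ((1u << 28) | (1u << 27))) == ((1u << 28) | (1u << 27));
#     __cpuid_count (7, 0, eax, ebx, ecx, edx);
#     int avx512f = (ebx >> 16) & 1;
#     unsigned long long xcr0 = avx ? xgetbv0 () : 0;
#     int ymm_state = (xcr0 & 0x6) == 0x6;     /* XCR0[2:1]: SSE + YMM  */
#     int zmm_state = (xcr0 & 0xe6) == 0xe6;   /* plus XCR0[7:5]: ZMM   */
#     printf ("AVX usable: %d  AVX-512 usable: %d\n",
#             avx && ymm_state, avx && avx512f && zmm_state);
#     return 0;
#   }
#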
470diff -urN glibc-2.12-2-gc4ccff1/sysdeps/x86_64/link-defines.sym glibc-2.12-2-gc4ccff1.mod/sysdeps/x86_64/link-defines.sym
471--- glibc-2.12-2-gc4ccff1/sysdeps/x86_64/link-defines.sym 2010-05-04 07:27:23.000000000 -0400
472+++ glibc-2.12-2-gc4ccff1.mod/sysdeps/x86_64/link-defines.sym 2015-03-03 23:03:25.042829206 -0500
473@@ -4,6 +4,8 @@
474 --
475 VECTOR_SIZE sizeof (La_x86_64_vector)
476 XMM_SIZE sizeof (La_x86_64_xmm)
477+YMM_SIZE sizeof (La_x86_64_ymm)
478+ZMM_SIZE sizeof (La_x86_64_zmm)
479
480 LR_SIZE sizeof (struct La_x86_64_regs)
481 LR_RDX_OFFSET offsetof (struct La_x86_64_regs, lr_rdx)
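#
# link-defines.sym is processed by glibc's gen-as-const machinery, which
# evaluates each expression as C and emits the constants for use from
# assembly, so dl-trampoline.S can address the save area as
# LR_VECTOR_OFFSET + i*VECTOR_SIZE.  VECTOR_SIZE stays 64: the union
# already held two ymm slots, so a single zmm fits without changing any
# La_x86_64_regs offsets, which is why the hand-encoded moves above step
# through the stack in 0x40 increments starting at 0xc0.  A quick C
# sketch of the sizes involved (types copied from the bits/link.h hunk):
#
#   #include <stdio.h>
#
#   typedef float  La_x86_64_xmm __attribute__ ((__vector_size__ (16)));
#   typedef float  La_x86_64_ymm __attribute__ ((__vector_size__ (32),
#                                                __aligned__ (16)));
#   typedef double La_x86_64_zmm __attribute__ ((__vector_size__ (64),
#                                                __aligned__ (16)));
#   typedef union
#   {
#     La_x86_64_ymm ymm[2];
#     La_x86_64_zmm zmm[1];
#     La_x86_64_xmm xmm[4];
#   } La_x86_64_vector __attribute__ ((__aligned__ (16)));
#
#   int
#   main (void)
#   {
#     /* Prints XMM_SIZE=16 YMM_SIZE=32 ZMM_SIZE=64 VECTOR_SIZE=64.  */
#     printf ("XMM_SIZE=%zu YMM_SIZE=%zu ZMM_SIZE=%zu VECTOR_SIZE=%zu\n",
#             sizeof (La_x86_64_xmm), sizeof (La_x86_64_ymm),
#             sizeof (La_x86_64_zmm), sizeof (La_x86_64_vector));
#     return 0;
#   }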