#
# Based on AVX-512 support for glibc, but heavily modified for rhel-6.7.
# Without assembler support we drop all of the configure checks and simply
# emit, via .byte directives, the minimal AVX-512 instructions required
# by the loader. Likewise, testing with the toolchain is impossible, so
# instead we use the Intel emulator running in `-skx` (Skylake Xeon)
# emulation mode and verify that a pre-built set of tests passes.
#
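# As an illustration of the .byte technique (explanatory only, not part
# of the patch itself): an assembler without AVX-512 support can still
# produce, say, vmovdqu64 %zmm0,0xc0(%rsp) by spelling out its EVEX
# encoding, exactly as the hunks below do:
#
#     .byte 0x62,0xf1,0xfe,0x48,0x7f,0x44,0x24,0x03   # vmovdqu64 %zmm0,0xc0(%rsp)
#
# The 8-bit displacement 0x03 is scaled by the 64-byte vector width
# (EVEX compressed disp8), which is what yields the 0xc0 stack offset.
#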
# commit 6986b98a18490e76b16911d1c6b1ba013598d40d
# Author: Ulrich Drepper <drepper@gmail.com>
# Date: Wed Jul 20 14:20:00 2011 -0400
#
# Force La_x86_64_ymm to be 16-byte aligned
#
# commit aa4de9cea5c07d43caeaca9722c2d417e9a2919c
# Author: H.J. Lu <hjl.tools@gmail.com>
# Date: Fri Mar 14 08:51:25 2014 -0700
#
# Check AVX-512 assembler support first
#
# It checks AVX-512 assembler support first and sets libc_cv_cc_avx512 to
# $libc_cv_asm_avx512, instead of yes. GCC won't support AVX-512 if the
# assembler doesn't support it.
#
# * sysdeps/x86_64/configure.ac: Check AVX-512 assembler support
# first. Disable AVX-512 GCC support if assembler doesn't support
# it.
# * sysdeps/x86_64/configure: Regenerated.
#
# commit 2d63a517e4084ec80403cd9f278690fa8b676cc4
# Author: Igor Zamyatin <igor.zamyatin@intel.com>
# Date: Thu Mar 13 11:10:22 2014 -0700
#
# Save and restore AVX-512 zmm registers to x86-64 ld.so
#
# AVX-512 ISA adds 512-bit zmm registers. This patch updates
# _dl_runtime_profile to pass zmm registers to run-time audit. It also
# changes _dl_x86_64_save_sse and _dl_x86_64_restore_sse to support zmm
# registers, which are called only when RTLD_PREPARE_FOREIGN_CALL
# is used. Its performance impact is minimal.
#
# * config.h.in (HAVE_AVX512_SUPPORT): New #undef.
# (HAVE_AVX512_ASM_SUPPORT): Likewise.
# * sysdeps/x86_64/bits/link.h (La_x86_64_zmm): New.
# (La_x86_64_vector): Add zmm.
# * sysdeps/x86_64/Makefile (tests): Add tst-audit10.
# (modules-names): Add tst-auditmod10a and tst-auditmod10b.
# ($(objpfx)tst-audit10): New target.
# ($(objpfx)tst-audit10.out): Likewise.
# (tst-audit10-ENV): New.
# (AVX512-CFLAGS): Likewise.
# (CFLAGS-tst-audit10.c): Likewise.
# (CFLAGS-tst-auditmod10a.c): Likewise.
# (CFLAGS-tst-auditmod10b.c): Likewise.
# * sysdeps/x86_64/configure.ac: Set config-cflags-avx512,
# HAVE_AVX512_SUPPORT and HAVE_AVX512_ASM_SUPPORT.
# * sysdeps/x86_64/configure: Regenerated.
# * sysdeps/x86_64/dl-trampoline.S (_dl_runtime_profile): Add
# AVX-512 zmm register support.
# (_dl_x86_64_save_sse): Likewise.
# (_dl_x86_64_restore_sse): Likewise.
# * sysdeps/x86_64/dl-trampoline.h: Updated to support different
# size vector registers.
# * sysdeps/x86_64/link-defines.sym (YMM_SIZE): New.
# (ZMM_SIZE): Likewise.
# * sysdeps/x86_64/tst-audit10.c: New file.
# * sysdeps/x86_64/tst-auditmod10a.c: Likewise.
# * sysdeps/x86_64/tst-auditmod10b.c: Likewise.
#
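# A rough sketch of the emulator-based check described at the top of
# this file (the sde command below and the exact test invocation are
# illustrative, not something this patch ships): run a pre-built audit
# test such as tst-audit10 under SDE's Skylake Xeon model and check its
# exit status, since the build machine's toolchain and CPU cannot run
# the AVX-512 paths natively:
#
#     sde -skx -- ./tst-audit10 && echo PASS
#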
# In addition this adds
# https://sourceware.org/ml/libc-alpha/2014-09/msg00228.html
# to extend zmm register checking.
#
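# For reference, the 0xe6 value tested against the XGETBV result in the
# dl-trampoline.S hunks below breaks down as (a worked note on the
# existing check, not new logic):
#
#     0xe6 = 1110 0110b
#            bits 7:5 = 111b  -> Hi16_ZMM, ZMM_Hi256 and opmask state
#            bits 2:1 = 11b   -> AVX (ymm) and SSE (xmm) state
#
# so (XCR0 & 0xe6) == 0xe6 means the OS saves the full zmm state across
# context switches and the AVX-512 code paths can safely be used.
#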
diff -urN glibc-2.12-2-gc4ccff1/sysdeps/x86_64/bits/link.h glibc-2.12-2-gc4ccff1.mod/sysdeps/x86_64/bits/link.h
--- glibc-2.12-2-gc4ccff1/sysdeps/x86_64/bits/link.h 2010-05-04 07:27:23.000000000 -0400
+++ glibc-2.12-2-gc4ccff1.mod/sysdeps/x86_64/bits/link.h 2015-03-03 23:03:25.041829238 -0500
@@ -65,7 +65,10 @@
 /* Registers for entry into PLT on x86-64.  */
 # if __GNUC_PREREQ (4,0)
 typedef float La_x86_64_xmm __attribute__ ((__vector_size__ (16)));
-typedef float La_x86_64_ymm __attribute__ ((__vector_size__ (32)));
+typedef float La_x86_64_ymm __attribute__ ((__vector_size__ (32),
+ __aligned__ (16)));
+typedef double La_x86_64_zmm __attribute__ ((__vector_size__ (64),
+ __aligned__ (16)));
 # else
 typedef float La_x86_64_xmm __attribute__ ((__mode__ (__V4SF__)));
 # endif
@@ -74,9 +77,10 @@
 {
 # if __GNUC_PREREQ (4,0)
 La_x86_64_ymm ymm[2];
+ La_x86_64_zmm zmm[1];
 # endif
 La_x86_64_xmm xmm[4];
-} La_x86_64_vector __attribute__ ((aligned(16)));
+} La_x86_64_vector __attribute__ ((__aligned__(16)));

 typedef struct La_x86_64_regs
 {
diff -urN glibc-2.12-2-gc4ccff1/sysdeps/x86_64/dl-trampoline.h glibc-2.12-2-gc4ccff1.mod/sysdeps/x86_64/dl-trampoline.h
--- glibc-2.12-2-gc4ccff1/sysdeps/x86_64/dl-trampoline.h 2015-03-03 23:03:05.109457627 -0500
+++ glibc-2.12-2-gc4ccff1.mod/sysdeps/x86_64/dl-trampoline.h 2015-03-03 23:06:58.434101818 -0500
@@ -20,14 +20,26 @@

 #ifdef RESTORE_AVX
 /* This is to support AVX audit modules.  */
- vmovdqu %ymm0, (LR_VECTOR_OFFSET)(%rsp)
- vmovdqu %ymm1, (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp)
- vmovdqu %ymm2, (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp)
- vmovdqu %ymm3, (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp)
- vmovdqu %ymm4, (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp)
- vmovdqu %ymm5, (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp)
- vmovdqu %ymm6, (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp)
- vmovdqu %ymm7, (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp)
+# if HAVE_NO_AVX512_ASM_SUPPORT
+ /* Save AVX-512 registers.  Use .byte because we lack assembler support. */
+ .byte 0x62,0xf1,0xfe,0x48,0x7f,0x44,0x24,0x03 # vmovdqu64 %zmm0,0xc0(%rsp)
+ .byte 0x62,0xf1,0xfe,0x48,0x7f,0x4c,0x24,0x04 # vmovdqu64 %zmm1,0x100(%rsp)
+ .byte 0x62,0xf1,0xfe,0x48,0x7f,0x54,0x24,0x05 # vmovdqu64 %zmm2,0x140(%rsp)
+ .byte 0x62,0xf1,0xfe,0x48,0x7f,0x5c,0x24,0x06 # vmovdqu64 %zmm3,0x180(%rsp)
+ .byte 0x62,0xf1,0xfe,0x48,0x7f,0x64,0x24,0x07 # vmovdqu64 %zmm4,0x1c0(%rsp)
+ .byte 0x62,0xf1,0xfe,0x48,0x7f,0x6c,0x24,0x08 # vmovdqu64 %zmm5,0x200(%rsp)
+ .byte 0x62,0xf1,0xfe,0x48,0x7f,0x74,0x24,0x09 # vmovdqu64 %zmm6,0x240(%rsp)
+ .byte 0x62,0xf1,0xfe,0x48,0x7f,0x7c,0x24,0x0a # vmovdqu64 %zmm7,0x280(%rsp)
+# else
+ VMOV %VEC(0), (LR_VECTOR_OFFSET)(%rsp)
+ VMOV %VEC(1), (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp)
+ VMOV %VEC(2), (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp)
+ VMOV %VEC(3), (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp)
+ VMOV %VEC(4), (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp)
+ VMOV %VEC(5), (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp)
+ VMOV %VEC(6), (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp)
+ VMOV %VEC(7), (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp)
+# endif

 /* Save xmm0-xmm7 registers to detect if any of them are
 changed by audit module. */
@@ -73,7 +85,11 @@
 je 2f
 vmovdqa %xmm0, (LR_VECTOR_OFFSET)(%rsp)
 jmp 1f
-2: vmovdqu (LR_VECTOR_OFFSET)(%rsp), %ymm0
+# if HAVE_NO_AVX512_ASM_SUPPORT
+2: .byte 0x62,0xf1,0xfe,0x48,0x6f,0x44,0x24,0x03 # vmovdqu64 0xc0(%rsp),%zmm0
+# else
+2: VMOV (LR_VECTOR_OFFSET)(%rsp), %VEC(0)
+# endif
 vmovdqa %xmm0, (LR_XMM_OFFSET)(%rsp)

 1: vpcmpeqq (LR_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm8
@@ -82,7 +98,11 @@
 je 2f
 vmovdqa %xmm1, (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp)
 jmp 1f
-2: vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp), %ymm1
+# if HAVE_NO_AVX512_ASM_SUPPORT
+2: .byte 0x62,0xf1,0xfe,0x48,0x6f,0x4c,0x24,0x04 # vmovdqu64 0x100(%rsp),%zmm1
+# else
+2: VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp), %VEC(1)
+# endif
 vmovdqa %xmm1, (LR_XMM_OFFSET + XMM_SIZE)(%rsp)

 1: vpcmpeqq (LR_SIZE + XMM_SIZE*2)(%rsp), %xmm2, %xmm8
@@ -91,7 +111,11 @@
 je 2f
 vmovdqa %xmm2, (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp)
 jmp 1f
-2: vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp), %ymm2
+# if HAVE_NO_AVX512_ASM_SUPPORT
+2: .byte 0x62,0xf1,0xfe,0x48,0x6f,0x54,0x24,0x05 # vmovdqu64 0x140(%rsp),%zmm2
+# else
+2: VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp), %VEC(2)
+# endif
 vmovdqa %xmm2, (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp)

 1: vpcmpeqq (LR_SIZE + XMM_SIZE*3)(%rsp), %xmm3, %xmm8
@@ -100,7 +124,11 @@
 je 2f
 vmovdqa %xmm3, (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp)
 jmp 1f
-2: vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp), %ymm3
+# if HAVE_NO_AVX512_ASM_SUPPORT
+2: .byte 0x62,0xf1,0xfe,0x48,0x6f,0x5c,0x24,0x06 # vmovdqu64 0x180(%rsp),%zmm3
+# else
+2: VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp), %VEC(3)
+# endif
 vmovdqa %xmm3, (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp)

 1: vpcmpeqq (LR_SIZE + XMM_SIZE*4)(%rsp), %xmm4, %xmm8
@@ -109,7 +137,11 @@
 je 2f
 vmovdqa %xmm4, (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp)
 jmp 1f
-2: vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp), %ymm4
+# if HAVE_NO_AVX512_ASM_SUPPORT
+2: .byte 0x62,0xf1,0xfe,0x48,0x6f,0x64,0x24,0x07 # vmovdqu64 0x1c0(%rsp),%zmm4
+# else
+2: VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp), %VEC(4)
+# endif
 vmovdqa %xmm4, (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp)

 1: vpcmpeqq (LR_SIZE + XMM_SIZE*5)(%rsp), %xmm5, %xmm8
@@ -118,7 +150,11 @@
 je 2f
 vmovdqa %xmm5, (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp)
 jmp 1f
-2: vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp), %ymm5
+# if HAVE_NO_AVX512_ASM_SUPPORT
+2: .byte 0x62,0xf1,0xfe,0x48,0x6f,0x6c,0x24,0x08 # vmovdqu64 0x200(%rsp),%zmm5
+# else
+2: VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp), %VEC(5)
+# endif
 vmovdqa %xmm5, (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp)

 1: vpcmpeqq (LR_SIZE + XMM_SIZE*6)(%rsp), %xmm6, %xmm8
@@ -127,7 +163,11 @@
 je 2f
 vmovdqa %xmm6, (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp)
 jmp 1f
-2: vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp), %ymm6
+# if HAVE_NO_AVX512_ASM_SUPPORT
+2: .byte 0x62,0xf1,0xfe,0x48,0x6f,0x74,0x24,0x09 # vmovdqu64 0x240(%rsp),%zmm6
+# else
+2: VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp), %VEC(6)
+# endif
 vmovdqa %xmm6, (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp)

 1: vpcmpeqq (LR_SIZE + XMM_SIZE*7)(%rsp), %xmm7, %xmm8
@@ -136,7 +176,11 @@
 je 2f
 vmovdqa %xmm7, (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp)
 jmp 1f
-2: vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp), %ymm7
+# if HAVE_NO_AVX512_ASM_SUPPORT
+2: .byte 0x62,0xf1,0xfe,0x48,0x6f,0x7c,0x24,0x0a # vmovdqu64 0x280(%rsp),%zmm7
+# else
+2: VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp), %VEC(7)
+# endif
 vmovdqa %xmm7, (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp)

 1:
@@ -214,8 +258,13 @@

 #ifdef RESTORE_AVX
 /* This is to support AVX audit modules.  */
- vmovdqu %ymm0, LRV_VECTOR0_OFFSET(%rcx)
- vmovdqu %ymm1, LRV_VECTOR1_OFFSET(%rcx)
+# if HAVE_NO_AVX512_ASM_SUPPORT
+ .byte 0x62,0xf1,0xfe,0x48,0x7f,0x81,0x50,0x00,0x00,0x00 # vmovdqu64 %zmm0,0x50(%rcx)
+ .byte 0x62,0xf1,0xfe,0x48,0x7f,0x89,0x90,0x00,0x00,0x00 # vmovdqu64 %zmm1,0x90(%rcx)
+# else
+ VMOV %VEC(0), LRV_VECTOR0_OFFSET(%rcx)
+ VMOV %VEC(1), LRV_VECTOR1_OFFSET(%rcx)
+# endif

 /* Save xmm0/xmm1 registers to detect if they are changed
 by audit module. */
@@ -244,13 +293,21 @@
 vpmovmskb %xmm2, %esi
 cmpl $0xffff, %esi
 jne 1f
- vmovdqu LRV_VECTOR0_OFFSET(%rsp), %ymm0
+# if HAVE_NO_AVX512_ASM_SUPPORT
+ .byte 0x62,0xf1,0xfe,0x48,0x6f,0x84,0x24,0x50,0x00,0x00,0x00 # vmovdqu64 0x50(%rsp),%zmm0
+# else
+ VMOV LRV_VECTOR0_OFFSET(%rsp), %VEC(0)
+# endif

 1: vpcmpeqq (LRV_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm2
 vpmovmskb %xmm2, %esi
 cmpl $0xffff, %esi
 jne 1f
- vmovdqu LRV_VECTOR1_OFFSET(%rsp), %ymm1
+# if HAVE_NO_AVX512_ASM_SUPPORT
+ .byte 0x62,0xf1,0xfe,0x48,0x6f,0x8c,0x24,0x90,0x00,0x00,0x00 # vmovdqu64 0x90(%rsp),%zmm1
+# else
+ VMOV LRV_VECTOR1_OFFSET(%rsp), %VEC(1)
+# endif

 1:
 #endif
diff -urN glibc-2.12-2-gc4ccff1/sysdeps/x86_64/dl-trampoline.S glibc-2.12-2-gc4ccff1.mod/sysdeps/x86_64/dl-trampoline.S
--- glibc-2.12-2-gc4ccff1/sysdeps/x86_64/dl-trampoline.S 2015-03-03 23:03:05.108457659 -0500
+++ glibc-2.12-2-gc4ccff1.mod/sysdeps/x86_64/dl-trampoline.S 2015-03-03 23:07:31.799049953 -0500
@@ -134,7 +134,7 @@
 .previous

 cmpl $0, L(have_avx)(%rip)
- jne 1f
+ jne L(defined)
 movq %rbx, %r11 # Save rbx
 movl $1, %eax
 cpuid
@@ -143,18 +143,51 @@
 // AVX and XSAVE supported?
 andl $((1 << 28) | (1 << 27)), %ecx
 cmpl $((1 << 28) | (1 << 27)), %ecx
- jne 2f
+ jne 10f
+ // AVX512 supported in processor?
+ movq %rbx, %r11 # Save rbx
+ xorl %ecx, %ecx
+ mov $0x7, %eax
+ cpuid
+ andl $(1 << 16), %ebx
 xorl %ecx, %ecx
 // Get XFEATURE_ENABLED_MASK
 xgetbv
- andl $0x6, %eax
-2: subl $0x5, %eax
+ test %ebx, %ebx
+ movq %r11, %rbx # Restore rbx
+ je 20f
+ // Verify that XCR0[7:5] = '111b' and
+ // XCR0[2:1] = '11b' which means
+ // that zmm state is enabled
+ andl $0xe6, %eax
+ cmpl $0xe6, %eax
+ jne 20f
+ movl %eax, L(have_avx)(%rip)
+L(avx512):
+# define RESTORE_AVX
+# define HAVE_NO_AVX512_ASM_SUPPORT 1
+# define VMOV vmovdqu64
+# define VEC(i) zmm##i
+# define MORE_CODE
+# include "dl-trampoline.h"
+# undef VMOV
+# undef VEC
+# undef RESTORE_AVX
+# undef HAVE_NO_AVX512_ASM_SUPPORT
+20: andl $0x6, %eax
+10: subl $0x5, %eax
 movl %eax, L(have_avx)(%rip)
 cmpl $0, %eax

-1: js L(no_avx)
+L(defined):
+ js L(no_avx)
+ cmpl $0xe6, L(have_avx)(%rip)
+ je L(avx512)
+

 # define RESTORE_AVX
+# define VMOV vmovdqu
+# define VEC(i) ymm##i
 # define MORE_CODE
 # include "dl-trampoline.h"

@@ -178,7 +211,7 @@
 _dl_x86_64_save_sse:
 # ifdef HAVE_AVX_SUPPORT
 cmpl $0, L(have_avx)(%rip)
- jne 1f
+ jne L(defined_5)
 movq %rbx, %r11 # Save rbx
 movl $1, %eax
 cpuid
@@ -187,21 +220,37 @@
 // AVX and XSAVE supported?
 andl $((1 << 28) | (1 << 27)), %ecx
 cmpl $((1 << 28) | (1 << 27)), %ecx
- jne 2f
+ jne 1f
+ // AVX512 supported in a processor?
+ movq %rbx, %r11 # Save rbx
+ xorl %ecx,%ecx
+ mov $0x7,%eax
+ cpuid
+ andl $(1 << 16), %ebx
 xorl %ecx, %ecx
 // Get XFEATURE_ENABLED_MASK
 xgetbv
- andl $0x6, %eax
- cmpl $0x6, %eax
- // Nonzero if SSE and AVX state saving is enabled.
- sete %al
-2: leal -1(%eax,%eax), %eax
+ test %ebx, %ebx
+ movq %r11, %rbx # Restore rbx
+ je 2f
+ // Verify that XCR0[7:5] = '111b' and
+ // XCR0[2:1] = '11b' which means
+ // that zmm state is enabled
+ andl $0xe6, %eax
 movl %eax, L(have_avx)(%rip)
- cmpl $0, %eax
+ cmpl $0xe6, %eax
+ je L(avx512_5)

-1: js L(no_avx5)
+2: andl $0x6, %eax
+1: subl $0x5, %eax
+ movl %eax, L(have_avx)(%rip)
+ cmpl $0, %eax

-# define YMM_SIZE 32
+L(defined_5):
+ js L(no_avx5)
+ cmpl $0xe6, L(have_avx)(%rip)
+ je L(avx512_5)
+
 vmovdqa %ymm0, %fs:RTLD_SAVESPACE_SSE+0*YMM_SIZE
 vmovdqa %ymm1, %fs:RTLD_SAVESPACE_SSE+1*YMM_SIZE
 vmovdqa %ymm2, %fs:RTLD_SAVESPACE_SSE+2*YMM_SIZE
@@ -211,6 +260,26 @@
 vmovdqa %ymm6, %fs:RTLD_SAVESPACE_SSE+6*YMM_SIZE
 vmovdqa %ymm7, %fs:RTLD_SAVESPACE_SSE+7*YMM_SIZE
 ret
+L(avx512_5):
+# Original instructions:
+# vmovdqu64 %zmm0, %fs:RTLD_SAVESPACE_SSE+0*ZMM_SIZE
+# vmovdqu64 %zmm1, %fs:RTLD_SAVESPACE_SSE+1*ZMM_SIZE
+# vmovdqu64 %zmm2, %fs:RTLD_SAVESPACE_SSE+2*ZMM_SIZE
+# vmovdqu64 %zmm3, %fs:RTLD_SAVESPACE_SSE+3*ZMM_SIZE
+# vmovdqu64 %zmm4, %fs:RTLD_SAVESPACE_SSE+4*ZMM_SIZE
+# vmovdqu64 %zmm5, %fs:RTLD_SAVESPACE_SSE+5*ZMM_SIZE
+# vmovdqu64 %zmm6, %fs:RTLD_SAVESPACE_SSE+6*ZMM_SIZE
+# vmovdqu64 %zmm7, %fs:RTLD_SAVESPACE_SSE+7*ZMM_SIZE
+# Assembled instructions:
+ .byte 0x64,0x62,0xf1,0xfe,0x48,0x7f,0x04,0x25,0x80,0x00,0x00,0x00 # vmovdqu64 %zmm0,%fs:0x80
+ .byte 0x64,0x62,0xf1,0xfe,0x48,0x7f,0x0c,0x25,0xc0,0x00,0x00,0x00 # vmovdqu64 %zmm1,%fs:0xc0
+ .byte 0x64,0x62,0xf1,0xfe,0x48,0x7f,0x14,0x25,0x00,0x01,0x00,0x00 # vmovdqu64 %zmm2,%fs:0x100
+ .byte 0x64,0x62,0xf1,0xfe,0x48,0x7f,0x1c,0x25,0x40,0x01,0x00,0x00 # vmovdqu64 %zmm3,%fs:0x140
+ .byte 0x64,0x62,0xf1,0xfe,0x48,0x7f,0x24,0x25,0x80,0x01,0x00,0x00 # vmovdqu64 %zmm4,%fs:0x180
+ .byte 0x64,0x62,0xf1,0xfe,0x48,0x7f,0x2c,0x25,0xc0,0x01,0x00,0x00 # vmovdqu64 %zmm5,%fs:0x1c0
+ .byte 0x64,0x62,0xf1,0xfe,0x48,0x7f,0x34,0x25,0x00,0x02,0x00,0x00 # vmovdqu64 %zmm6,%fs:0x200
+ .byte 0x64,0x62,0xf1,0xfe,0x48,0x7f,0x3c,0x25,0x40,0x02,0x00,0x00 # vmovdqu64 %zmm7,%fs:0x240
+ ret
 L(no_avx5):
 # endif
 movdqa %xmm0, %fs:RTLD_SAVESPACE_SSE+0*XMM_SIZE
@@ -234,6 +303,8 @@
 # ifdef HAVE_AVX_SUPPORT
 cmpl $0, L(have_avx)(%rip)
 js L(no_avx6)
+ cmpl $0xe6, L(have_avx)(%rip)
+ je L(avx512_6)

 vmovdqa %fs:RTLD_SAVESPACE_SSE+0*YMM_SIZE, %ymm0
 vmovdqa %fs:RTLD_SAVESPACE_SSE+1*YMM_SIZE, %ymm1
@@ -244,6 +315,26 @@
 vmovdqa %fs:RTLD_SAVESPACE_SSE+6*YMM_SIZE, %ymm6
 vmovdqa %fs:RTLD_SAVESPACE_SSE+7*YMM_SIZE, %ymm7
 ret
+L(avx512_6):
+# Original instructions:
+# vmovdqu64 %fs:RTLD_SAVESPACE_SSE+0*ZMM_SIZE, %zmm0
+# vmovdqu64 %fs:RTLD_SAVESPACE_SSE+1*ZMM_SIZE, %zmm1
+# vmovdqu64 %fs:RTLD_SAVESPACE_SSE+2*ZMM_SIZE, %zmm2
+# vmovdqu64 %fs:RTLD_SAVESPACE_SSE+3*ZMM_SIZE, %zmm3
+# vmovdqu64 %fs:RTLD_SAVESPACE_SSE+4*ZMM_SIZE, %zmm4
+# vmovdqu64 %fs:RTLD_SAVESPACE_SSE+5*ZMM_SIZE, %zmm5
+# vmovdqu64 %fs:RTLD_SAVESPACE_SSE+6*ZMM_SIZE, %zmm6
+# vmovdqu64 %fs:RTLD_SAVESPACE_SSE+7*ZMM_SIZE, %zmm7
+# Assembled instructions:
+ .byte 0x64,0x62,0xf1,0xfe,0x48,0x6f,0x04,0x25,0x80,0x00,0x00,0x00 # vmovdqu64 %fs:0x80,%zmm0
+ .byte 0x64,0x62,0xf1,0xfe,0x48,0x6f,0x0c,0x25,0xc0,0x00,0x00,0x00 # vmovdqu64 %fs:0xc0,%zmm1
+ .byte 0x64,0x62,0xf1,0xfe,0x48,0x6f,0x14,0x25,0x00,0x01,0x00,0x00 # vmovdqu64 %fs:0x100,%zmm2
+ .byte 0x64,0x62,0xf1,0xfe,0x48,0x6f,0x1c,0x25,0x40,0x01,0x00,0x00 # vmovdqu64 %fs:0x140,%zmm3
+ .byte 0x64,0x62,0xf1,0xfe,0x48,0x6f,0x24,0x25,0x80,0x01,0x00,0x00 # vmovdqu64 %fs:0x180,%zmm4
+ .byte 0x64,0x62,0xf1,0xfe,0x48,0x6f,0x2c,0x25,0xc0,0x01,0x00,0x00 # vmovdqu64 %fs:0x1c0,%zmm5
+ .byte 0x64,0x62,0xf1,0xfe,0x48,0x6f,0x34,0x25,0x00,0x02,0x00,0x00 # vmovdqu64 %fs:0x200,%zmm6
+ .byte 0x64,0x62,0xf1,0xfe,0x48,0x6f,0x3c,0x25,0x40,0x02,0x00,0x00 # vmovdqu64 %fs:0x240,%zmm7
+ ret
 L(no_avx6):
 # endif
 movdqa %fs:RTLD_SAVESPACE_SSE+0*XMM_SIZE, %xmm0
diff -urN glibc-2.12-2-gc4ccff1/sysdeps/x86_64/link-defines.sym glibc-2.12-2-gc4ccff1.mod/sysdeps/x86_64/link-defines.sym
--- glibc-2.12-2-gc4ccff1/sysdeps/x86_64/link-defines.sym 2010-05-04 07:27:23.000000000 -0400
+++ glibc-2.12-2-gc4ccff1.mod/sysdeps/x86_64/link-defines.sym 2015-03-03 23:03:25.042829206 -0500
@@ -4,6 +4,8 @@
 --
 VECTOR_SIZE sizeof (La_x86_64_vector)
 XMM_SIZE sizeof (La_x86_64_xmm)
+YMM_SIZE sizeof (La_x86_64_ymm)
+ZMM_SIZE sizeof (La_x86_64_zmm)

 LR_SIZE sizeof (struct La_x86_64_regs)
 LR_RDX_OFFSET offsetof (struct La_x86_64_regs, lr_rdx)