]>
Commit | Line | Data |
---|---|---|
12788f63 MT |
1 | 2011-03-02 Harsha Jagasia <harsha.jagasia@amd.com> |
2 | Ulrich Drepper <drepper@gmail.com> | |
3 | ||
4 | * sysdeps/x86_64/memset.S: After aligning destination, code | |
5 | branches to different locations depending on the value of | |
6 | misalignment, when multiarch is enabled. Fix this. | |
7 | ||
8 | 2011-03-02 Harsha Jagasia <harsha.jagasia@amd.com> | |
9 | ||
10 | * sysdeps/x86_64/cacheinfo.c (init_cacheinfo): | |
11 | Set _x86_64_preferred_memory_instruction for AMD processors. | |
12 | * sysdeps/x86_64/multiarch/init-arch.c (__init_cpu_features): | |
13 | Set bit_Prefer_SSE_for_memop for AMD processors. | |
14 | ||
15 | 2010-11-07 H.J. Lu <hongjiu.lu@intel.com> | |
16 | ||
17 | * sysdeps/x86_64/memset.S: Check USE_MULTIARCH and USE_SSE2 for | |
18 | IFUNC support. | |
19 | * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add | |
20 | memset-x86-64. | |
21 | * sysdeps/x86_64/multiarch/bzero.S: New file. | |
22 | * sysdeps/x86_64/multiarch/cacheinfo.c: New file. | |
23 | * sysdeps/x86_64/multiarch/memset-x86-64.S: New file. | |
24 | * sysdeps/x86_64/multiarch/memset.S: New file. | |
25 | * sysdeps/x86_64/multiarch/memset_chk.S: New file. | |
26 | * sysdeps/x86_64/multiarch/init-arch.c (__init_cpu_features): | |
27 | Set bit_Prefer_SSE_for_memop for Intel processors. | |
28 | * sysdeps/x86_64/multiarch/init-arch.h (bit_Prefer_SSE_for_memop): | |
29 | Define. | |
30 | (index_Prefer_SSE_for_memop): Define. | |
31 | (HAS_PREFER_SSE_FOR_MEMOP): Define. | |
32 | ||
33 | Index: glibc-2.12-2-gc4ccff1/sysdeps/x86_64/cacheinfo.c | |
34 | =================================================================== | |
35 | --- glibc-2.12-2-gc4ccff1.orig/sysdeps/x86_64/cacheinfo.c | |
36 | +++ glibc-2.12-2-gc4ccff1/sysdeps/x86_64/cacheinfo.c | |
37 | @@ -613,6 +613,25 @@ init_cacheinfo (void) | |
38 | long int core = handle_amd (_SC_LEVEL2_CACHE_SIZE); | |
39 | shared = handle_amd (_SC_LEVEL3_CACHE_SIZE); | |
40 | ||
41 | +#ifndef DISABLE_PREFERRED_MEMORY_INSTRUCTION | |
42 | +# ifdef USE_MULTIARCH | |
43 | + eax = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].eax; | |
44 | + ebx = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].ebx; | |
45 | + ecx = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].ecx; | |
46 | + edx = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].edx; | |
47 | +# else | |
48 | + __cpuid (1, eax, ebx, ecx, edx); | |
49 | +# endif | |
50 | + | |
51 | + /* AMD prefers SSSE3 instructions for memory/string routines | |
52 | + if they are available, otherwise it prefers integer | |
53 | + instructions. */ | |
54 | + if ((ecx & 0x200)) | |
55 | + __x86_64_preferred_memory_instruction = 3; | |
56 | + else | |
57 | + __x86_64_preferred_memory_instruction = 0; | |
58 | +#endif | |
59 | + | |
60 | /* Get maximum extended function. */ | |
61 | __cpuid (0x80000000, max_cpuid_ex, ebx, ecx, edx); | |
62 | ||
63 | Index: glibc-2.12-2-gc4ccff1/sysdeps/x86_64/memset.S | |
64 | =================================================================== | |
65 | --- glibc-2.12-2-gc4ccff1.orig/sysdeps/x86_64/memset.S | |
66 | +++ glibc-2.12-2-gc4ccff1/sysdeps/x86_64/memset.S | |
67 | @@ -24,7 +24,7 @@ | |
68 | #define __STOS_UPPER_BOUNDARY $65536 | |
69 | ||
70 | .text | |
71 | -#ifndef NOT_IN_libc | |
72 | +#if !defined NOT_IN_libc && !defined USE_MULTIARCH | |
73 | ENTRY(__bzero) | |
74 | mov %rsi,%rdx /* Adjust parameter. */ | |
75 | xorl %esi,%esi /* Fill with 0s. */ | |
76 | @@ -34,10 +34,10 @@ weak_alias (__bzero, bzero) | |
77 | #endif | |
78 | ||
79 | #if defined PIC && !defined NOT_IN_libc | |
80 | -ENTRY (__memset_chk) | |
81 | +ENTRY_CHK (__memset_chk) | |
82 | cmpq %rdx, %rcx | |
83 | jb HIDDEN_JUMPTARGET (__chk_fail) | |
84 | -END (__memset_chk) | |
85 | +END_CHK (__memset_chk) | |
86 | #endif | |
87 | ENTRY (memset) | |
88 | L(memset_entry): | |
89 | @@ -591,157 +591,15 @@ L(A6Q1): mov %dx,-0xe(%rdi) | |
90 | L(A7Q0): mov %dl,-0x7(%rdi) | |
91 | L(A6Q0): mov %dx,-0x6(%rdi) | |
92 | mov %edx,-0x4(%rdi) | |
93 | - jmp L(aligned_now) | |
94 | - | |
95 | - .balign 16 | |
96 | -L(aligned_now): | |
97 | - | |
98 | - cmpl $0x1,__x86_64_preferred_memory_instruction(%rip) | |
99 | - jg L(SSE_pre) | |
100 | - | |
101 | -L(8byte_move_try): | |
102 | - cmpq __STOS_LOWER_BOUNDARY,%r8 | |
103 | - jae L(8byte_stos_try) | |
104 | - | |
105 | - .balign 16 | |
106 | -L(8byte_move): | |
107 | - movq %r8,%rcx | |
108 | - shrq $7,%rcx | |
109 | - jz L(8byte_move_skip) | |
110 | - | |
111 | - .p2align 4 | |
112 | - | |
113 | -L(8byte_move_loop): | |
114 | - decq %rcx | |
115 | - | |
116 | - movq %rdx, (%rdi) | |
117 | - movq %rdx, 8 (%rdi) | |
118 | - movq %rdx, 16 (%rdi) | |
119 | - movq %rdx, 24 (%rdi) | |
120 | - movq %rdx, 32 (%rdi) | |
121 | - movq %rdx, 40 (%rdi) | |
122 | - movq %rdx, 48 (%rdi) | |
123 | - movq %rdx, 56 (%rdi) | |
124 | - movq %rdx, 64 (%rdi) | |
125 | - movq %rdx, 72 (%rdi) | |
126 | - movq %rdx, 80 (%rdi) | |
127 | - movq %rdx, 88 (%rdi) | |
128 | - movq %rdx, 96 (%rdi) | |
129 | - movq %rdx, 104 (%rdi) | |
130 | - movq %rdx, 112 (%rdi) | |
131 | - movq %rdx, 120 (%rdi) | |
132 | - | |
133 | - leaq 128 (%rdi),%rdi | |
134 | - | |
135 | - jnz L(8byte_move_loop) | |
136 | - | |
137 | -L(8byte_move_skip): | |
138 | - andl $127,%r8d | |
139 | - lea (%rdi,%r8,1),%rdi | |
140 | - | |
141 | -#ifndef PIC | |
142 | - lea L(setPxQx)(%rip),%r11 | |
143 | - jmpq *(%r11,%r8,8) # old scheme remained for nonPIC | |
144 | -#else | |
145 | - lea L(Got0)(%rip),%r11 | |
146 | - lea L(setPxQx)(%rip),%rcx | |
147 | - movswq (%rcx,%r8,2),%rcx | |
148 | - lea (%rcx,%r11,1),%r11 | |
149 | - jmpq *%r11 | |
150 | -#endif | |
151 | - | |
152 | - .balign 16 | |
153 | -L(8byte_stos_try): | |
154 | - mov __x86_64_shared_cache_size(%rip),%r9d // ck largest cache size | |
155 | - cmpq %r8,%r9 // calculate the lesser of remaining | |
156 | - cmovaq %r8,%r9 // bytes and largest cache size | |
157 | - jbe L(8byte_stos) | |
158 | - | |
159 | -L(8byte_move_reuse_try): | |
160 | - cmp __STOS_UPPER_BOUNDARY,%r8 | |
161 | - jae L(8byte_move) | |
162 | - | |
163 | - .balign 16 | |
164 | -L(8byte_stos): | |
165 | - movq %r9,%rcx | |
166 | - andq $-8,%r9 | |
167 | - | |
168 | - shrq $3,%rcx | |
169 | - jz L(8byte_stos_skip) | |
170 | - | |
171 | - xchgq %rax,%rdx | |
172 | - | |
173 | - rep | |
174 | - stosq | |
175 | - | |
176 | - xchgq %rax,%rdx | |
177 | - | |
178 | -L(8byte_stos_skip): | |
179 | - subq %r9,%r8 | |
180 | - ja L(8byte_nt_move) | |
181 | - | |
182 | - andl $7,%r8d | |
183 | - lea (%rdi,%r8,1),%rdi | |
184 | -#ifndef PIC | |
185 | - lea L(setPxQx)(%rip),%r11 | |
186 | - jmpq *(%r11,%r8,8) # old scheme remained for nonPIC | |
187 | -#else | |
188 | - lea L(Got0)(%rip),%r11 | |
189 | - lea L(setPxQx)(%rip),%rcx | |
190 | - movswq (%rcx,%r8,2),%rcx | |
191 | - lea (%rcx,%r11,1),%r11 | |
192 | - jmpq *%r11 | |
193 | -#endif | |
194 | ||
195 | - .balign 16 | |
196 | -L(8byte_nt_move): | |
197 | - movq %r8,%rcx | |
198 | - shrq $7,%rcx | |
199 | - jz L(8byte_nt_move_skip) | |
200 | - | |
201 | - .balign 16 | |
202 | -L(8byte_nt_move_loop): | |
203 | - decq %rcx | |
204 | - | |
205 | - movntiq %rdx, (%rdi) | |
206 | - movntiq %rdx, 8 (%rdi) | |
207 | - movntiq %rdx, 16 (%rdi) | |
208 | - movntiq %rdx, 24 (%rdi) | |
209 | - movntiq %rdx, 32 (%rdi) | |
210 | - movntiq %rdx, 40 (%rdi) | |
211 | - movntiq %rdx, 48 (%rdi) | |
212 | - movntiq %rdx, 56 (%rdi) | |
213 | - movntiq %rdx, 64 (%rdi) | |
214 | - movntiq %rdx, 72 (%rdi) | |
215 | - movntiq %rdx, 80 (%rdi) | |
216 | - movntiq %rdx, 88 (%rdi) | |
217 | - movntiq %rdx, 96 (%rdi) | |
218 | - movntiq %rdx, 104 (%rdi) | |
219 | - movntiq %rdx, 112 (%rdi) | |
220 | - movntiq %rdx, 120 (%rdi) | |
221 | - | |
222 | - leaq 128 (%rdi),%rdi | |
223 | - | |
224 | - jnz L(8byte_nt_move_loop) | |
225 | - | |
226 | - sfence | |
227 | - | |
228 | -L(8byte_nt_move_skip): | |
229 | - andl $127,%r8d | |
230 | +#ifndef USE_MULTIARCH | |
231 | + jmp L(aligned_now) | |
232 | ||
233 | - lea (%rdi,%r8,1),%rdi | |
234 | -#ifndef PIC | |
235 | - lea L(setPxQx)(%rip),%r11 | |
236 | - jmpq *(%r11,%r8,8) # old scheme remained for nonPIC | |
237 | +L(SSE_pre): | |
238 | #else | |
239 | - lea L(Got0)(%rip),%r11 | |
240 | - lea L(setPxQx)(%rip),%rcx | |
241 | - movswq (%rcx,%r8,2),%rcx | |
242 | - lea (%rcx,%r11,1),%r11 | |
243 | - jmpq *%r11 | |
244 | +L(aligned_now): | |
245 | #endif | |
246 | - | |
247 | -L(SSE_pre): | |
248 | +#if !defined USE_MULTIARCH || defined USE_SSE2 | |
249 | # fill RegXMM0 with the pattern | |
250 | movd %rdx,%xmm0 | |
251 | punpcklqdq %xmm0,%xmm0 | |
252 | @@ -1342,11 +1200,162 @@ L(SSExDx): | |
253 | .short L(SSE15QB)-L(SSE0Q0) | |
254 | #endif | |
255 | .popsection | |
256 | +#endif /* !defined USE_MULTIARCH || defined USE_SSE2 */ | |
257 | + | |
258 | + .balign 16 | |
259 | +#ifndef USE_MULTIARCH | |
260 | +L(aligned_now): | |
261 | + | |
262 | + cmpl $0x1,__x86_64_preferred_memory_instruction(%rip) | |
263 | + jg L(SSE_pre) | |
264 | +#endif /* USE_MULTIARCH */ | |
265 | + | |
266 | +L(8byte_move_try): | |
267 | + cmpq __STOS_LOWER_BOUNDARY,%r8 | |
268 | + jae L(8byte_stos_try) | |
269 | + | |
270 | + .balign 16 | |
271 | +L(8byte_move): | |
272 | + movq %r8,%rcx | |
273 | + shrq $7,%rcx | |
274 | + jz L(8byte_move_skip) | |
275 | + | |
276 | + .p2align 4 | |
277 | + | |
278 | +L(8byte_move_loop): | |
279 | + decq %rcx | |
280 | + | |
281 | + movq %rdx, (%rdi) | |
282 | + movq %rdx, 8 (%rdi) | |
283 | + movq %rdx, 16 (%rdi) | |
284 | + movq %rdx, 24 (%rdi) | |
285 | + movq %rdx, 32 (%rdi) | |
286 | + movq %rdx, 40 (%rdi) | |
287 | + movq %rdx, 48 (%rdi) | |
288 | + movq %rdx, 56 (%rdi) | |
289 | + movq %rdx, 64 (%rdi) | |
290 | + movq %rdx, 72 (%rdi) | |
291 | + movq %rdx, 80 (%rdi) | |
292 | + movq %rdx, 88 (%rdi) | |
293 | + movq %rdx, 96 (%rdi) | |
294 | + movq %rdx, 104 (%rdi) | |
295 | + movq %rdx, 112 (%rdi) | |
296 | + movq %rdx, 120 (%rdi) | |
297 | + | |
298 | + leaq 128 (%rdi),%rdi | |
299 | + | |
300 | + jnz L(8byte_move_loop) | |
301 | + | |
302 | +L(8byte_move_skip): | |
303 | + andl $127,%r8d | |
304 | + lea (%rdi,%r8,1),%rdi | |
305 | + | |
306 | +#ifndef PIC | |
307 | + lea L(setPxQx)(%rip),%r11 | |
308 | + jmpq *(%r11,%r8,8) # old scheme remained for nonPIC | |
309 | +#else | |
310 | + lea L(Got0)(%rip),%r11 | |
311 | + lea L(setPxQx)(%rip),%rcx | |
312 | + movswq (%rcx,%r8,2),%rcx | |
313 | + lea (%rcx,%r11,1),%r11 | |
314 | + jmpq *%r11 | |
315 | +#endif | |
316 | + | |
317 | + .balign 16 | |
318 | +L(8byte_stos_try): | |
319 | + mov __x86_64_shared_cache_size(%rip),%r9d // ck largest cache size | |
320 | + cmpq %r8,%r9 // calculate the lesser of remaining | |
321 | + cmovaq %r8,%r9 // bytes and largest cache size | |
322 | + jbe L(8byte_stos) | |
323 | + | |
324 | +L(8byte_move_reuse_try): | |
325 | + cmp __STOS_UPPER_BOUNDARY,%r8 | |
326 | + jae L(8byte_move) | |
327 | + | |
328 | + .balign 16 | |
329 | +L(8byte_stos): | |
330 | + movq %r9,%rcx | |
331 | + andq $-8,%r9 | |
332 | + | |
333 | + shrq $3,%rcx | |
334 | + jz L(8byte_stos_skip) | |
335 | + | |
336 | + xchgq %rax,%rdx | |
337 | + | |
338 | + rep | |
339 | + stosq | |
340 | + | |
341 | + xchgq %rax,%rdx | |
342 | + | |
343 | +L(8byte_stos_skip): | |
344 | + subq %r9,%r8 | |
345 | + ja L(8byte_nt_move) | |
346 | + | |
347 | + andl $7,%r8d | |
348 | + lea (%rdi,%r8,1),%rdi | |
349 | +#ifndef PIC | |
350 | + lea L(setPxQx)(%rip),%r11 | |
351 | + jmpq *(%r11,%r8,8) # old scheme remained for nonPIC | |
352 | +#else | |
353 | + lea L(Got0)(%rip),%r11 | |
354 | + lea L(setPxQx)(%rip),%rcx | |
355 | + movswq (%rcx,%r8,2),%rcx | |
356 | + lea (%rcx,%r11,1),%r11 | |
357 | + jmpq *%r11 | |
358 | +#endif | |
359 | + | |
360 | + .balign 16 | |
361 | +L(8byte_nt_move): | |
362 | + movq %r8,%rcx | |
363 | + shrq $7,%rcx | |
364 | + jz L(8byte_nt_move_skip) | |
365 | + | |
366 | + .balign 16 | |
367 | +L(8byte_nt_move_loop): | |
368 | + decq %rcx | |
369 | + | |
370 | + movntiq %rdx, (%rdi) | |
371 | + movntiq %rdx, 8 (%rdi) | |
372 | + movntiq %rdx, 16 (%rdi) | |
373 | + movntiq %rdx, 24 (%rdi) | |
374 | + movntiq %rdx, 32 (%rdi) | |
375 | + movntiq %rdx, 40 (%rdi) | |
376 | + movntiq %rdx, 48 (%rdi) | |
377 | + movntiq %rdx, 56 (%rdi) | |
378 | + movntiq %rdx, 64 (%rdi) | |
379 | + movntiq %rdx, 72 (%rdi) | |
380 | + movntiq %rdx, 80 (%rdi) | |
381 | + movntiq %rdx, 88 (%rdi) | |
382 | + movntiq %rdx, 96 (%rdi) | |
383 | + movntiq %rdx, 104 (%rdi) | |
384 | + movntiq %rdx, 112 (%rdi) | |
385 | + movntiq %rdx, 120 (%rdi) | |
386 | + | |
387 | + leaq 128 (%rdi),%rdi | |
388 | + | |
389 | + jnz L(8byte_nt_move_loop) | |
390 | + | |
391 | + sfence | |
392 | + | |
393 | +L(8byte_nt_move_skip): | |
394 | + andl $127,%r8d | |
395 | + | |
396 | + lea (%rdi,%r8,1),%rdi | |
397 | +#ifndef PIC | |
398 | + lea L(setPxQx)(%rip),%r11 | |
399 | + jmpq *(%r11,%r8,8) # old scheme remained for nonPIC | |
400 | +#else | |
401 | + lea L(Got0)(%rip),%r11 | |
402 | + lea L(setPxQx)(%rip),%rcx | |
403 | + movswq (%rcx,%r8,2),%rcx | |
404 | + lea (%rcx,%r11,1),%r11 | |
405 | + jmpq *%r11 | |
406 | +#endif | |
407 | ||
408 | END (memset) | |
409 | libc_hidden_builtin_def (memset) | |
410 | ||
411 | -#if defined PIC && !defined NOT_IN_libc | |
412 | +#if defined PIC && !defined NOT_IN_libc && !defined USE_MULTIARCH | |
413 | strong_alias (__memset_chk, __memset_zero_constant_len_parameter) | |
414 | .section .gnu.warning.__memset_zero_constant_len_parameter | |
415 | .string "memset used with constant zero length parameter; this could be due to transposed parameters" | |
416 | Index: glibc-2.12-2-gc4ccff1/sysdeps/x86_64/multiarch/Makefile | |
417 | =================================================================== | |
418 | --- glibc-2.12-2-gc4ccff1.orig/sysdeps/x86_64/multiarch/Makefile | |
419 | +++ glibc-2.12-2-gc4ccff1/sysdeps/x86_64/multiarch/Makefile | |
420 | @@ -7,7 +7,8 @@ ifeq ($(subdir),string) | |
421 | sysdep_routines += stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \ | |
422 | strend-sse4 memcmp-sse4 \ | |
423 | strcasestr-nonascii strcasecmp_l-ssse3 \ | |
424 | - strncase_l-ssse3 | |
425 | + strncase_l-ssse3 \ | |
426 | + memset-x86-64 | |
427 | ifeq (yes,$(config-cflags-sse4)) | |
428 | sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c | |
429 | CFLAGS-strcspn-c.c += -msse4 | |
430 | Index: glibc-2.12-2-gc4ccff1/sysdeps/x86_64/multiarch/bzero.S | |
431 | =================================================================== | |
432 | --- /dev/null | |
433 | +++ glibc-2.12-2-gc4ccff1/sysdeps/x86_64/multiarch/bzero.S | |
434 | @@ -0,0 +1,56 @@ | |
435 | +/* Multiple versions of bzero | |
436 | + Copyright (C) 2010 Free Software Foundation, Inc. | |
437 | + This file is part of the GNU C Library. | |
438 | + | |
439 | + The GNU C Library is free software; you can redistribute it and/or | |
440 | + modify it under the terms of the GNU Lesser General Public | |
441 | + License as published by the Free Software Foundation; either | |
442 | + version 2.1 of the License, or (at your option) any later version. | |
443 | + | |
444 | + The GNU C Library is distributed in the hope that it will be useful, | |
445 | + but WITHOUT ANY WARRANTY; without even the implied warranty of | |
446 | + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
447 | + Lesser General Public License for more details. | |
448 | + | |
449 | + You should have received a copy of the GNU Lesser General Public | |
450 | + License along with the GNU C Library; if not, write to the Free | |
451 | + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA | |
452 | + 02111-1307 USA. */ | |
453 | + | |
454 | +#include <sysdep.h> | |
455 | +#include <init-arch.h> | |
456 | + | |
457 | + .text | |
458 | +ENTRY(__bzero) | |
459 | + .type __bzero, @gnu_indirect_function | |
460 | + cmpl $0, __cpu_features+KIND_OFFSET(%rip) | |
461 | + jne 1f | |
462 | + call __init_cpu_features | |
463 | +1: leaq __bzero_x86_64(%rip), %rax | |
464 | + testl $bit_Prefer_SSE_for_memop, __cpu_features+FEATURE_OFFSET+index_Prefer_SSE_for_memop(%rip) | |
465 | + jz 2f | |
466 | + leaq __bzero_sse2(%rip), %rax | |
467 | +2: ret | |
468 | +END(__bzero) | |
469 | + | |
470 | + .type __bzero_sse2, @function | |
471 | +__bzero_sse2: | |
472 | + cfi_startproc | |
473 | + CALL_MCOUNT | |
474 | + mov %rsi,%rdx /* Adjust parameter. */ | |
475 | + xorl %esi,%esi /* Fill with 0s. */ | |
476 | + jmp __memset_sse2 | |
477 | + cfi_endproc | |
478 | + .size __bzero_sse2, .-__bzero_sse2 | |
479 | + | |
480 | + .type __bzero_x86_64, @function | |
481 | +__bzero_x86_64: | |
482 | + cfi_startproc | |
483 | + CALL_MCOUNT | |
484 | + mov %rsi,%rdx /* Adjust parameter. */ | |
485 | + xorl %esi,%esi /* Fill with 0s. */ | |
486 | + jmp __memset_x86_64 | |
487 | + cfi_endproc | |
488 | + .size __bzero_x86_64, .-__bzero_x86_64 | |
489 | + | |
490 | +weak_alias (__bzero, bzero) | |
491 | Index: glibc-2.12-2-gc4ccff1/sysdeps/x86_64/multiarch/cacheinfo.c | |
492 | =================================================================== | |
493 | --- /dev/null | |
494 | +++ glibc-2.12-2-gc4ccff1/sysdeps/x86_64/multiarch/cacheinfo.c | |
495 | @@ -0,0 +1,2 @@ | |
496 | +#define DISABLE_PREFERRED_MEMORY_INSTRUCTION | |
497 | +#include "../cacheinfo.c" | |
498 | Index: glibc-2.12-2-gc4ccff1/sysdeps/x86_64/multiarch/init-arch.c | |
499 | =================================================================== | |
500 | --- glibc-2.12-2-gc4ccff1.orig/sysdeps/x86_64/multiarch/init-arch.c | |
501 | +++ glibc-2.12-2-gc4ccff1/sysdeps/x86_64/multiarch/init-arch.c | |
502 | @@ -59,6 +59,11 @@ __init_cpu_features (void) | |
503 | ||
504 | get_common_indeces (&family, &model); | |
505 | ||
506 | + /* Intel processors prefer SSE instruction for memory/string | |
507 | + routines if they are available. */ | |
508 | + __cpu_features.feature[index_Prefer_SSE_for_memop] | |
509 | + |= bit_Prefer_SSE_for_memop; | |
510 | + | |
511 | unsigned int eax = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].eax; | |
512 | unsigned int extended_family = (eax >> 20) & 0xff; | |
513 | unsigned int extended_model = (eax >> 12) & 0xf0; | |
514 | @@ -92,6 +97,14 @@ __init_cpu_features (void) | |
515 | kind = arch_kind_amd; | |
516 | ||
517 | get_common_indeces (&family, &model); | |
518 | + | |
519 | + unsigned int ecx = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].ecx; | |
520 | + | |
521 | + /* AMD processors prefer SSE instructions for memory/string routines | |
522 | + if they are available, otherwise they prefer integer instructions. */ | |
523 | + if ((ecx & 0x200)) | |
524 | + __cpu_features.feature[index_Prefer_SSE_for_memop] | |
525 | + |= bit_Prefer_SSE_for_memop; | |
526 | } | |
527 | else | |
528 | kind = arch_kind_other; | |
529 | Index: glibc-2.12-2-gc4ccff1/sysdeps/x86_64/multiarch/init-arch.h | |
530 | =================================================================== | |
531 | --- glibc-2.12-2-gc4ccff1.orig/sysdeps/x86_64/multiarch/init-arch.h | |
532 | +++ glibc-2.12-2-gc4ccff1/sysdeps/x86_64/multiarch/init-arch.h | |
533 | @@ -16,7 +16,8 @@ | |
534 | Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA | |
535 | 02111-1307 USA. */ | |
536 | ||
537 | -#define bit_Fast_Rep_String (1 << 0) | |
538 | +#define bit_Fast_Rep_String (1 << 0) | |
539 | +#define bit_Prefer_SSE_for_memop (1 << 3) | |
540 | ||
541 | #ifdef __ASSEMBLER__ | |
542 | ||
543 | @@ -33,6 +34,7 @@ | |
544 | # define index_SSE4_2 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET | |
545 | ||
546 | #define index_Fast_Rep_String FEATURE_INDEX_1*FEATURE_SIZE | |
547 | +# define index_Prefer_SSE_for_memop FEATURE_INDEX_1*FEATURE_SIZE | |
548 | ||
549 | #else /* __ASSEMBLER__ */ | |
550 | ||
551 | @@ -103,5 +105,12 @@ extern const struct cpu_features *__get_ | |
552 | # define HAS_FMA HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, 12) | |
553 | ||
554 | # define index_Fast_Rep_String FEATURE_INDEX_1 | |
555 | +# define index_Prefer_SSE_for_memop FEATURE_INDEX_1 | |
556 | + | |
557 | +#define HAS_ARCH_FEATURE(idx, bit) \ | |
558 | + ((__get_cpu_features ()->feature[idx] & (bit)) != 0) | |
559 | + | |
560 | +#define HAS_PREFER_SSE_FOR_MEMOP \ | |
561 | + HAS_ARCH_FEATURE (index_Prefer_SSE_for_memop, bit_Prefer_SSE_for_memop) | |
562 | ||
563 | #endif /* __ASSEMBLER__ */ | |
564 | Index: glibc-2.12-2-gc4ccff1/sysdeps/x86_64/multiarch/memset-x86-64.S | |
565 | =================================================================== | |
566 | --- /dev/null | |
567 | +++ glibc-2.12-2-gc4ccff1/sysdeps/x86_64/multiarch/memset-x86-64.S | |
568 | @@ -0,0 +1,18 @@ | |
569 | +#include <sysdep.h> | |
570 | + | |
571 | +#ifndef NOT_IN_libc | |
572 | +# undef ENTRY_CHK | |
573 | +# define ENTRY_CHK(name) \ | |
574 | + .type __memset_chk_x86_64, @function; \ | |
575 | + .globl __memset_chk_x86_64; \ | |
576 | + .p2align 4; \ | |
577 | + __memset_chk_x86_64: cfi_startproc; \ | |
578 | + CALL_MCOUNT | |
579 | +# undef END_CHK | |
580 | +# define END_CHK(name) \ | |
581 | + cfi_endproc; .size __memset_chk_x86_64, .-__memset_chk_x86_64 | |
582 | + | |
583 | +# define libc_hidden_builtin_def(name) | |
584 | +# define memset __memset_x86_64 | |
585 | +# include "../memset.S" | |
586 | +#endif | |
587 | Index: glibc-2.12-2-gc4ccff1/sysdeps/x86_64/multiarch/memset.S | |
588 | =================================================================== | |
589 | --- /dev/null | |
590 | +++ glibc-2.12-2-gc4ccff1/sysdeps/x86_64/multiarch/memset.S | |
591 | @@ -0,0 +1,74 @@ | |
592 | +/* Multiple versions of memset | |
593 | + Copyright (C) 2010 Free Software Foundation, Inc. | |
594 | + This file is part of the GNU C Library. | |
595 | + | |
596 | + The GNU C Library is free software; you can redistribute it and/or | |
597 | + modify it under the terms of the GNU Lesser General Public | |
598 | + License as published by the Free Software Foundation; either | |
599 | + version 2.1 of the License, or (at your option) any later version. | |
600 | + | |
601 | + The GNU C Library is distributed in the hope that it will be useful, | |
602 | + but WITHOUT ANY WARRANTY; without even the implied warranty of | |
603 | + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
604 | + Lesser General Public License for more details. | |
605 | + | |
606 | + You should have received a copy of the GNU Lesser General Public | |
607 | + License along with the GNU C Library; if not, write to the Free | |
608 | + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA | |
609 | + 02111-1307 USA. */ | |
610 | + | |
611 | +#include <sysdep.h> | |
612 | +#include <init-arch.h> | |
613 | + | |
614 | +/* Define multiple versions only for the definition in lib. */ | |
615 | +#ifndef NOT_IN_libc | |
616 | +ENTRY(memset) | |
617 | + .type memset, @gnu_indirect_function | |
618 | + cmpl $0, __cpu_features+KIND_OFFSET(%rip) | |
619 | + jne 1f | |
620 | + call __init_cpu_features | |
621 | +1: leaq __memset_x86_64(%rip), %rax | |
622 | + testl $bit_Prefer_SSE_for_memop, __cpu_features+FEATURE_OFFSET+index_Prefer_SSE_for_memop(%rip) | |
623 | + jz 2f | |
624 | + leaq __memset_sse2(%rip), %rax | |
625 | +2: ret | |
626 | +END(memset) | |
627 | + | |
628 | +# define USE_SSE2 1 | |
629 | + | |
630 | +# undef ENTRY | |
631 | +# define ENTRY(name) \ | |
632 | + .type __memset_sse2, @function; \ | |
633 | + .globl __memset_sse2; \ | |
634 | + .p2align 4; \ | |
635 | + __memset_sse2: cfi_startproc; \ | |
636 | + CALL_MCOUNT | |
637 | +# undef END | |
638 | +# define END(name) \ | |
639 | + cfi_endproc; .size __memset_sse2, .-__memset_sse2 | |
640 | + | |
641 | +# undef ENTRY_CHK | |
642 | +# define ENTRY_CHK(name) \ | |
643 | + .type __memset_chk_sse2, @function; \ | |
644 | + .globl __memset_chk_sse2; \ | |
645 | + .p2align 4; \ | |
646 | + __memset_chk_sse2: cfi_startproc; \ | |
647 | + CALL_MCOUNT | |
648 | +# undef END_CHK | |
649 | +# define END_CHK(name) \ | |
650 | + cfi_endproc; .size __memset_chk_sse2, .-__memset_chk_sse2 | |
651 | + | |
652 | +# ifdef SHARED | |
653 | +# undef libc_hidden_builtin_def | |
654 | +/* It doesn't make sense to send libc-internal memset calls through a PLT. | |
655 | + The speedup we get from using GPR instruction is likely eaten away | |
656 | + by the indirect call in the PLT. */ | |
657 | +# define libc_hidden_builtin_def(name) \ | |
658 | + .globl __GI_memset; __GI_memset = __memset_sse2 | |
659 | +# endif | |
660 | + | |
661 | +# undef strong_alias | |
662 | +# define strong_alias(original, alias) | |
663 | +#endif | |
664 | + | |
665 | +#include "../memset.S" | |
666 | Index: glibc-2.12-2-gc4ccff1/sysdeps/x86_64/multiarch/memset_chk.S | |
667 | =================================================================== | |
668 | --- /dev/null | |
669 | +++ glibc-2.12-2-gc4ccff1/sysdeps/x86_64/multiarch/memset_chk.S | |
670 | @@ -0,0 +1,44 @@ | |
671 | +/* Multiple versions of __memset_chk | |
672 | + Copyright (C) 2010 Free Software Foundation, Inc. | |
673 | + This file is part of the GNU C Library. | |
674 | + | |
675 | + The GNU C Library is free software; you can redistribute it and/or | |
676 | + modify it under the terms of the GNU Lesser General Public | |
677 | + License as published by the Free Software Foundation; either | |
678 | + version 2.1 of the License, or (at your option) any later version. | |
679 | + | |
680 | + The GNU C Library is distributed in the hope that it will be useful, | |
681 | + but WITHOUT ANY WARRANTY; without even the implied warranty of | |
682 | + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
683 | + Lesser General Public License for more details. | |
684 | + | |
685 | + You should have received a copy of the GNU Lesser General Public | |
686 | + License along with the GNU C Library; if not, write to the Free | |
687 | + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA | |
688 | + 02111-1307 USA. */ | |
689 | + | |
690 | +#include <sysdep.h> | |
691 | +#include <init-arch.h> | |
692 | + | |
693 | +/* Define multiple versions only for the definition in lib. */ | |
694 | +#ifndef NOT_IN_libc | |
695 | +# ifdef SHARED | |
696 | +ENTRY(__memset_chk) | |
697 | + .type __memset_chk, @gnu_indirect_function | |
698 | + cmpl $0, __cpu_features+KIND_OFFSET(%rip) | |
699 | + jne 1f | |
700 | + call __init_cpu_features | |
701 | +1: leaq __memset_chk_x86_64(%rip), %rax | |
702 | + testl $bit_Prefer_SSE_for_memop, __cpu_features+FEATURE_OFFSET+index_Prefer_SSE_for_memop(%rip) | |
703 | + jz 2f | |
704 | + leaq __memset_chk_sse2(%rip), %rax | |
705 | +2: ret | |
706 | +END(__memset_chk) | |
707 | + | |
708 | +strong_alias (__memset_chk, __memset_zero_constant_len_parameter) | |
709 | + .section .gnu.warning.__memset_zero_constant_len_parameter | |
710 | + .string "memset used with constant zero length parameter; this could be due to transposed parameters" | |
711 | +# else | |
712 | +# include "../memset_chk.S" | |
713 | +# endif | |
714 | +#endif |