src/patches/glibc/glibc-rh695963.patch
1 2010-08-27 Ulrich Drepper <drepper@redhat.com>
2
3 * sysdeps/x86_64/multiarch/strlen-no-bsf.S: Move to .text.slow section.
4
5 * sysdeps/x86_64/strlen.S: Minimal code improvement.
6
7 2010-08-26 H.J. Lu <hongjiu.lu@intel.com>
8
9 * sysdeps/x86_64/strlen.S: Unroll the loop.
10 * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
11 strlen-sse2 strlen-sse2-bsf.
12 	* sysdeps/x86_64/multiarch/strlen.S (strlen): Return
13 __strlen_no_bsf if bit_Slow_BSF is set.
14 (__strlen_sse42): Removed.
15 * sysdeps/x86_64/multiarch/strlen-no-bsf.S: New file.
16 * sysdeps/x86_64/multiarch/strlen-sse4.S: New file.
17
18 2010-08-25 H.J. Lu <hongjiu.lu@intel.com>
19
20 * sysdeps/i386/i686/multiarch/Makefile (sysdep_routines): Add
21 strlen-sse2 strlen-sse2-bsf.
22 * sysdeps/i386/i686/multiarch/strlen.S (strlen): Return
23 __strlen_sse2_bsf if bit_Slow_BSF is unset.
24 (__strlen_sse2): Removed.
25 * sysdeps/i386/i686/multiarch/strlen-sse2-bsf.S: New file.
26 * sysdeps/i386/i686/multiarch/strlen-sse2.S: New file.
27 * sysdeps/x86_64/multiarch/init-arch.c (__init_cpu_features): Set
28 bit_Slow_BSF for Atom.
29 * sysdeps/x86_64/multiarch/init-arch.h (bit_Slow_BSF): Define.
30 (index_Slow_BSF): Define.
31 (HAS_SLOW_BSF): Define.
32
33 Index: glibc-2.12-2-gc4ccff1/sysdeps/i386/i686/multiarch/Makefile
34 ===================================================================
35 --- glibc-2.12-2-gc4ccff1.orig/sysdeps/i386/i686/multiarch/Makefile
36 +++ glibc-2.12-2-gc4ccff1/sysdeps/i386/i686/multiarch/Makefile
37 @@ -9,7 +9,8 @@ sysdep_routines += bzero-sse2 memset-sse
38 memmove-ssse3-rep bcopy-ssse3 bcopy-ssse3-rep \
39 memset-sse2-rep bzero-sse2-rep strcmp-ssse3 \
40 strcmp-sse4 strncmp-c strncmp-ssse3 strncmp-sse4 \
41 - memcmp-ssse3 memcmp-sse4 strcasestr-nonascii
42 + memcmp-ssse3 memcmp-sse4 strcasestr-nonascii \
43 + strlen-sse2 strlen-sse2-bsf
44 ifeq (yes,$(config-cflags-sse4))
45 sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c
46 CFLAGS-strcspn-c.c += -msse4
47 Index: glibc-2.12-2-gc4ccff1/sysdeps/i386/i686/multiarch/strlen-sse2-bsf.S
48 ===================================================================
49 --- /dev/null
50 +++ glibc-2.12-2-gc4ccff1/sysdeps/i386/i686/multiarch/strlen-sse2-bsf.S
51 @@ -0,0 +1,127 @@
52 +/* strlen with SSE2 and BSF
53 + Copyright (C) 2010 Free Software Foundation, Inc.
54 + Contributed by Intel Corporation.
55 + This file is part of the GNU C Library.
56 +
57 + The GNU C Library is free software; you can redistribute it and/or
58 + modify it under the terms of the GNU Lesser General Public
59 + License as published by the Free Software Foundation; either
60 + version 2.1 of the License, or (at your option) any later version.
61 +
62 + The GNU C Library is distributed in the hope that it will be useful,
63 + but WITHOUT ANY WARRANTY; without even the implied warranty of
64 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
65 + Lesser General Public License for more details.
66 +
67 + You should have received a copy of the GNU Lesser General Public
68 + License along with the GNU C Library; if not, write to the Free
69 + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
70 + 02111-1307 USA. */
71 +
72 +#if defined SHARED && !defined NOT_IN_libc
73 +
74 +#include <sysdep.h>
75 +#include "asm-syntax.h"
76 +
77 +#define CFI_PUSH(REG) \
78 + cfi_adjust_cfa_offset (4); \
79 + cfi_rel_offset (REG, 0)
80 +
81 +#define CFI_POP(REG) \
82 + cfi_adjust_cfa_offset (-4); \
83 + cfi_restore (REG)
84 +
85 +#define PUSH(REG) pushl REG; CFI_PUSH (REG)
86 +#define POP(REG) popl REG; CFI_POP (REG)
87 +#define PARMS 4 + 8 /* Preserve ESI and EDI. */
88 +#define STR PARMS
89 +#define ENTRANCE PUSH (%esi); PUSH (%edi); cfi_remember_state
90 +#define RETURN POP (%edi); POP (%esi); ret; \
91 + cfi_restore_state; cfi_remember_state
92 +
93 + .text
94 +ENTRY ( __strlen_sse2_bsf)
95 + ENTRANCE
96 + mov STR(%esp), %edi
97 + xor %eax, %eax
98 + mov %edi, %ecx
99 + and $0x3f, %ecx
100 + pxor %xmm0, %xmm0
101 + cmp $0x30, %ecx
102 + ja L(next)
103 + movdqu (%edi), %xmm1
104 + pcmpeqb %xmm1, %xmm0
105 + pmovmskb %xmm0, %edx
106 + test %edx, %edx
107 + jnz L(exit_less16)
108 + mov %edi, %eax
109 + and $-16, %eax
110 + jmp L(align16_start)
111 +L(next):
112 +
113 + mov %edi, %eax
114 + and $-16, %eax
115 + pcmpeqb (%eax), %xmm0
116 + mov $-1, %esi
117 + sub %eax, %ecx
118 + shl %cl, %esi
119 + pmovmskb %xmm0, %edx
120 + and %esi, %edx
121 + jnz L(exit)
122 +L(align16_start):
123 + pxor %xmm0, %xmm0
124 + pxor %xmm1, %xmm1
125 + pxor %xmm2, %xmm2
126 + pxor %xmm3, %xmm3
127 + .p2align 4
128 +L(align16_loop):
129 + pcmpeqb 16(%eax), %xmm0
130 + pmovmskb %xmm0, %edx
131 + test %edx, %edx
132 + jnz L(exit16)
133 +
134 + pcmpeqb 32(%eax), %xmm1
135 + pmovmskb %xmm1, %edx
136 + test %edx, %edx
137 + jnz L(exit32)
138 +
139 + pcmpeqb 48(%eax), %xmm2
140 + pmovmskb %xmm2, %edx
141 + test %edx, %edx
142 + jnz L(exit48)
143 +
144 + pcmpeqb 64(%eax), %xmm3
145 + pmovmskb %xmm3, %edx
146 + lea 64(%eax), %eax
147 + test %edx, %edx
148 + jz L(align16_loop)
149 +L(exit):
150 + sub %edi, %eax
151 +L(exit_less16):
152 + bsf %edx, %edx
153 + add %edx, %eax
154 + RETURN
155 +L(exit16):
156 + sub %edi, %eax
157 + bsf %edx, %edx
158 + add %edx, %eax
159 + add $16, %eax
160 + RETURN
161 +L(exit32):
162 + sub %edi, %eax
163 + bsf %edx, %edx
164 + add %edx, %eax
165 + add $32, %eax
166 + RETURN
167 +L(exit48):
168 + sub %edi, %eax
169 + bsf %edx, %edx
170 + add %edx, %eax
171 + add $48, %eax
172 + POP (%edi)
173 + POP (%esi)
174 + ret
175 +
176 +END ( __strlen_sse2_bsf)
177 +
178 +#endif
179 Index: glibc-2.12-2-gc4ccff1/sysdeps/i386/i686/multiarch/strlen-sse2.S
180 ===================================================================
181 --- /dev/null
182 +++ glibc-2.12-2-gc4ccff1/sysdeps/i386/i686/multiarch/strlen-sse2.S
183 @@ -0,0 +1,347 @@
184 +/* strlen with SSE2
185 + Copyright (C) 2010 Free Software Foundation, Inc.
186 + Contributed by Intel Corporation.
187 + This file is part of the GNU C Library.
188 +
189 + The GNU C Library is free software; you can redistribute it and/or
190 + modify it under the terms of the GNU Lesser General Public
191 + License as published by the Free Software Foundation; either
192 + version 2.1 of the License, or (at your option) any later version.
193 +
194 + The GNU C Library is distributed in the hope that it will be useful,
195 + but WITHOUT ANY WARRANTY; without even the implied warranty of
196 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
197 + Lesser General Public License for more details.
198 +
199 + You should have received a copy of the GNU Lesser General Public
200 + License along with the GNU C Library; if not, write to the Free
201 + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
202 + 02111-1307 USA. */
203 +
204 +#if defined SHARED && !defined NOT_IN_libc
205 +
206 +#include <sysdep.h>
207 +#include "asm-syntax.h"
208 +
209 +#define CFI_PUSH(REG) \
210 + cfi_adjust_cfa_offset (4); \
211 + cfi_rel_offset (REG, 0)
212 +
213 +#define CFI_POP(REG) \
214 + cfi_adjust_cfa_offset (-4); \
215 + cfi_restore (REG)
216 +
217 +#define PUSH(REG) pushl REG; CFI_PUSH (REG)
218 +#define POP(REG) popl REG; CFI_POP (REG)
219 +#define PARMS 4
220 +#define STR PARMS
221 +#define ENTRANCE
222 +#define RETURN ret
223 +
224 + .text
225 +ENTRY (__strlen_sse2)
226 + ENTRANCE
227 + mov STR(%esp), %edx
228 + xor %eax, %eax
229 + cmpb $0, (%edx)
230 + jz L(exit_tail0)
231 + cmpb $0, 1(%edx)
232 + jz L(exit_tail1)
233 + cmpb $0, 2(%edx)
234 + jz L(exit_tail2)
235 + cmpb $0, 3(%edx)
236 + jz L(exit_tail3)
237 + cmpb $0, 4(%edx)
238 + jz L(exit_tail4)
239 + cmpb $0, 5(%edx)
240 + jz L(exit_tail5)
241 + cmpb $0, 6(%edx)
242 + jz L(exit_tail6)
243 + cmpb $0, 7(%edx)
244 + jz L(exit_tail7)
245 + cmpb $0, 8(%edx)
246 + jz L(exit_tail8)
247 + cmpb $0, 9(%edx)
248 + jz L(exit_tail9)
249 + cmpb $0, 10(%edx)
250 + jz L(exit_tail10)
251 + cmpb $0, 11(%edx)
252 + jz L(exit_tail11)
253 + cmpb $0, 12(%edx)
254 + jz L(exit_tail12)
255 + cmpb $0, 13(%edx)
256 + jz L(exit_tail13)
257 + cmpb $0, 14(%edx)
258 + jz L(exit_tail14)
259 + cmpb $0, 15(%edx)
260 + jz L(exit_tail15)
261 + pxor %xmm0, %xmm0
262 + mov %edx, %eax
263 + mov %edx, %ecx
264 + and $-16, %eax
265 + add $16, %ecx
266 + add $16, %eax
267 +
268 + pcmpeqb (%eax), %xmm0
269 + pmovmskb %xmm0, %edx
270 + pxor %xmm1, %xmm1
271 + test %edx, %edx
272 + lea 16(%eax), %eax
273 + jnz L(exit)
274 +
275 + pcmpeqb (%eax), %xmm1
276 + pmovmskb %xmm1, %edx
277 + pxor %xmm2, %xmm2
278 + test %edx, %edx
279 + lea 16(%eax), %eax
280 + jnz L(exit)
281 +
282 +
283 + pcmpeqb (%eax), %xmm2
284 + pmovmskb %xmm2, %edx
285 + pxor %xmm3, %xmm3
286 + test %edx, %edx
287 + lea 16(%eax), %eax
288 + jnz L(exit)
289 +
290 + pcmpeqb (%eax), %xmm3
291 + pmovmskb %xmm3, %edx
292 + test %edx, %edx
293 + lea 16(%eax), %eax
294 + jnz L(exit)
295 +
296 + pcmpeqb (%eax), %xmm0
297 + pmovmskb %xmm0, %edx
298 + test %edx, %edx
299 + lea 16(%eax), %eax
300 + jnz L(exit)
301 +
302 + pcmpeqb (%eax), %xmm1
303 + pmovmskb %xmm1, %edx
304 + test %edx, %edx
305 + lea 16(%eax), %eax
306 + jnz L(exit)
307 +
308 + pcmpeqb (%eax), %xmm2
309 + pmovmskb %xmm2, %edx
310 + test %edx, %edx
311 + lea 16(%eax), %eax
312 + jnz L(exit)
313 +
314 + pcmpeqb (%eax), %xmm3
315 + pmovmskb %xmm3, %edx
316 + test %edx, %edx
317 + lea 16(%eax), %eax
318 + jnz L(exit)
319 +
320 + pcmpeqb (%eax), %xmm0
321 + pmovmskb %xmm0, %edx
322 + test %edx, %edx
323 + lea 16(%eax), %eax
324 + jnz L(exit)
325 +
326 + pcmpeqb (%eax), %xmm1
327 + pmovmskb %xmm1, %edx
328 + test %edx, %edx
329 + lea 16(%eax), %eax
330 + jnz L(exit)
331 +
332 + pcmpeqb (%eax), %xmm2
333 + pmovmskb %xmm2, %edx
334 + test %edx, %edx
335 + lea 16(%eax), %eax
336 + jnz L(exit)
337 +
338 + pcmpeqb (%eax), %xmm3
339 + pmovmskb %xmm3, %edx
340 + test %edx, %edx
341 + lea 16(%eax), %eax
342 + jnz L(exit)
343 +
344 + pcmpeqb (%eax), %xmm0
345 + pmovmskb %xmm0, %edx
346 + test %edx, %edx
347 + lea 16(%eax), %eax
348 + jnz L(exit)
349 +
350 + pcmpeqb (%eax), %xmm1
351 + pmovmskb %xmm1, %edx
352 + test %edx, %edx
353 + lea 16(%eax), %eax
354 + jnz L(exit)
355 +
356 + pcmpeqb (%eax), %xmm2
357 + pmovmskb %xmm2, %edx
358 + test %edx, %edx
359 + lea 16(%eax), %eax
360 + jnz L(exit)
361 +
362 + pcmpeqb (%eax), %xmm3
363 + pmovmskb %xmm3, %edx
364 + test %edx, %edx
365 + lea 16(%eax), %eax
366 + jnz L(exit)
367 +
368 + and $-0x40, %eax
369 + PUSH (%esi)
370 + PUSH (%edi)
371 + PUSH (%ebx)
372 + PUSH (%ebp)
373 + xor %ebp, %ebp
374 +L(aligned_64):
375 + pcmpeqb (%eax), %xmm0
376 + pcmpeqb 16(%eax), %xmm1
377 + pcmpeqb 32(%eax), %xmm2
378 + pcmpeqb 48(%eax), %xmm3
379 + pmovmskb %xmm0, %edx
380 + pmovmskb %xmm1, %esi
381 + pmovmskb %xmm2, %edi
382 + pmovmskb %xmm3, %ebx
383 + or %edx, %ebp
384 + or %esi, %ebp
385 + or %edi, %ebp
386 + or %ebx, %ebp
387 + lea 64(%eax), %eax
388 + jz L(aligned_64)
389 +L(48leave):
390 + test %edx, %edx
391 + jnz L(aligned_64_exit_16)
392 + test %esi, %esi
393 + jnz L(aligned_64_exit_32)
394 + test %edi, %edi
395 + jnz L(aligned_64_exit_48)
396 + mov %ebx, %edx
397 + lea (%eax), %eax
398 + jmp L(aligned_64_exit)
399 +L(aligned_64_exit_48):
400 + lea -16(%eax), %eax
401 + mov %edi, %edx
402 + jmp L(aligned_64_exit)
403 +L(aligned_64_exit_32):
404 + lea -32(%eax), %eax
405 + mov %esi, %edx
406 + jmp L(aligned_64_exit)
407 +L(aligned_64_exit_16):
408 + lea -48(%eax), %eax
409 +L(aligned_64_exit):
410 + POP (%ebp)
411 + POP (%ebx)
412 + POP (%edi)
413 + POP (%esi)
414 +L(exit):
415 + sub %ecx, %eax
416 + test %dl, %dl
417 + jz L(exit_high)
418 + test $0x01, %dl
419 + jnz L(exit_tail0)
420 +
421 + test $0x02, %dl
422 + jnz L(exit_tail1)
423 +
424 + test $0x04, %dl
425 + jnz L(exit_tail2)
426 +
427 + test $0x08, %dl
428 + jnz L(exit_tail3)
429 +
430 + test $0x10, %dl
431 + jnz L(exit_tail4)
432 +
433 + test $0x20, %dl
434 + jnz L(exit_tail5)
435 +
436 + test $0x40, %dl
437 + jnz L(exit_tail6)
438 + add $7, %eax
439 +L(exit_tail0):
440 + RETURN
441 +
442 +L(exit_high):
443 + add $8, %eax
444 + test $0x01, %dh
445 + jnz L(exit_tail0)
446 +
447 + test $0x02, %dh
448 + jnz L(exit_tail1)
449 +
450 + test $0x04, %dh
451 + jnz L(exit_tail2)
452 +
453 + test $0x08, %dh
454 + jnz L(exit_tail3)
455 +
456 + test $0x10, %dh
457 + jnz L(exit_tail4)
458 +
459 + test $0x20, %dh
460 + jnz L(exit_tail5)
461 +
462 + test $0x40, %dh
463 + jnz L(exit_tail6)
464 + add $7, %eax
465 + RETURN
466 +
467 + .p2align 4
468 +L(exit_tail1):
469 + add $1, %eax
470 + RETURN
471 +
472 +L(exit_tail2):
473 + add $2, %eax
474 + RETURN
475 +
476 +L(exit_tail3):
477 + add $3, %eax
478 + RETURN
479 +
480 +L(exit_tail4):
481 + add $4, %eax
482 + RETURN
483 +
484 +L(exit_tail5):
485 + add $5, %eax
486 + RETURN
487 +
488 +L(exit_tail6):
489 + add $6, %eax
490 + RETURN
491 +
492 +L(exit_tail7):
493 + add $7, %eax
494 + RETURN
495 +
496 +L(exit_tail8):
497 + add $8, %eax
498 + RETURN
499 +
500 +L(exit_tail9):
501 + add $9, %eax
502 + RETURN
503 +
504 +L(exit_tail10):
505 + add $10, %eax
506 + RETURN
507 +
508 +L(exit_tail11):
509 + add $11, %eax
510 + RETURN
511 +
512 +L(exit_tail12):
513 + add $12, %eax
514 + RETURN
515 +
516 +L(exit_tail13):
517 + add $13, %eax
518 + RETURN
519 +
520 +L(exit_tail14):
521 + add $14, %eax
522 + RETURN
523 +
524 +L(exit_tail15):
525 + add $15, %eax
526 + ret
527 +
528 +END (__strlen_sse2)
529 +
530 +#endif
531 Index: glibc-2.12-2-gc4ccff1/sysdeps/i386/i686/multiarch/strlen.S
532 ===================================================================
533 --- glibc-2.12-2-gc4ccff1.orig/sysdeps/i386/i686/multiarch/strlen.S
534 +++ glibc-2.12-2-gc4ccff1/sysdeps/i386/i686/multiarch/strlen.S
535 @@ -48,6 +48,9 @@ ENTRY(strlen)
536 1: leal __strlen_ia32@GOTOFF(%ebx), %eax
537 testl $bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
538 jz 2f
539 + leal __strlen_sse2_bsf@GOTOFF(%ebx), %eax
540 + testl $bit_Slow_BSF, FEATURE_OFFSET+index_Slow_BSF+__cpu_features@GOTOFF(%ebx)
541 + jz 2f
542 leal __strlen_sse2@GOTOFF(%ebx), %eax
543 2: popl %ebx
544 cfi_adjust_cfa_offset (-4);
545 @@ -55,84 +58,6 @@ ENTRY(strlen)
546 ret
547 END(strlen)
548
549 -#define CFI_POP(REG) \
550 - cfi_adjust_cfa_offset (-4); \
551 - cfi_restore (REG)
552 -
553 -#define RETURN popl %esi; CFI_POP (esi); ret
554 -
555 - .text
556 -ENTRY (__strlen_sse2)
557 -/*
558 - * This implementation uses SSE instructions to compare up to 16 bytes
559 - * at a time looking for the end of string (null char).
560 - */
561 - pushl %esi
562 - cfi_adjust_cfa_offset (4)
563 - cfi_rel_offset (%esi, 0)
564 - mov 8(%esp), %eax
565 - mov %eax, %ecx
566 - pxor %xmm0, %xmm0 /* 16 null chars */
567 - mov %eax, %esi
568 - and $15, %ecx
569 - jz 1f /* string is 16 byte aligned */
570 -
571 - /*
572 - * Unaligned case. Round down to 16-byte boundary before comparing
573 - * 16 bytes for a null char. The code then compensates for any extra chars
574 - * preceding the start of the string.
575 - */
576 - and $-16, %esi
577 -
578 - pcmpeqb (%esi), %xmm0
579 - lea 16(%eax), %esi
580 - pmovmskb %xmm0, %edx
581 -
582 - shr %cl, %edx /* Compensate for bytes preceding the string */
583 - test %edx, %edx
584 - jnz 2f
585 - sub %ecx, %esi /* no null, adjust to next 16-byte boundary */
586 - pxor %xmm0, %xmm0 /* clear xmm0, may have been changed... */
587 -
588 - .p2align 4
589 -1: /* 16 byte aligned */
590 - pcmpeqb (%esi), %xmm0 /* look for null bytes */
591 - pmovmskb %xmm0, %edx /* move each byte mask of %xmm0 to edx */
592 -
593 - add $16, %esi /* prepare to search next 16 bytes */
594 - test %edx, %edx /* if no null byte, %edx must be 0 */
595 - jnz 2f /* found a null */
596 -
597 - pcmpeqb (%esi), %xmm0
598 - pmovmskb %xmm0, %edx
599 - add $16, %esi
600 - test %edx, %edx
601 - jnz 2f
602 -
603 - pcmpeqb (%esi), %xmm0
604 - pmovmskb %xmm0, %edx
605 - add $16, %esi
606 - test %edx, %edx
607 - jnz 2f
608 -
609 - pcmpeqb (%esi), %xmm0
610 - pmovmskb %xmm0, %edx
611 - add $16, %esi
612 - test %edx, %edx
613 - jz 1b
614 -
615 -2:
616 - neg %eax
617 - lea -16(%eax, %esi), %eax /* calculate exact offset */
618 - bsf %edx, %ecx /* Least significant 1 bit is index of null */
619 - add %ecx, %eax
620 - popl %esi
621 - cfi_adjust_cfa_offset (-4)
622 - cfi_restore (%esi)
623 - ret
624 -
625 -END (__strlen_sse2)
626 -
627 # undef ENTRY
628 # define ENTRY(name) \
629 .type __strlen_ia32, @function; \
630 --- a/sysdeps/x86_64/multiarch/Makefile 2012-03-01 10:43:30.060487726 -0700
631 +++ b/sysdeps/x86_64/multiarch/Makefile 2012-03-01 10:45:57.894692115 -0700
632 @@ -7,7 +7,7 @@ ifeq ($(subdir),string)
633 sysdep_routines += stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
634 strend-sse4 memcmp-sse4 \
635 strcasestr-nonascii strcasecmp_l-ssse3 \
636 - strncase_l-ssse3 \
637 + strncase_l-ssse3 strlen-sse4 strlen-no-bsf \
638 memset-x86-64
639 ifeq (yes,$(config-cflags-sse4))
640 sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c
641 Index: glibc-2.12-2-gc4ccff1/sysdeps/x86_64/multiarch/init-arch.c
642 ===================================================================
643 --- glibc-2.12-2-gc4ccff1.orig/sysdeps/x86_64/multiarch/init-arch.c
644 +++ glibc-2.12-2-gc4ccff1/sysdeps/x86_64/multiarch/init-arch.c
645 @@ -77,6 +77,12 @@ __init_cpu_features (void)
646 model += extended_model;
647 switch (model)
648 {
649 + case 0x1c:
650 + case 0x26:
651 + /* BSF is slow on Atom. */
652 + __cpu_features.feature[index_Slow_BSF] |= bit_Slow_BSF;
653 + break;
654 +
655 case 0x1a:
656 case 0x1e:
657 case 0x1f:
658 --- a/sysdeps/x86_64/multiarch/init-arch.h 2012-03-01 10:43:30.061487720 -0700
659 +++ b/sysdeps/x86_64/multiarch/init-arch.h 2012-03-01 10:48:13.371963005 -0700
660 @@ -17,6 +17,7 @@
661 02111-1307 USA. */
662
663 #define bit_Fast_Rep_String (1 << 0)
664 +#define bit_Slow_BSF (1 << 2)
665 #define bit_Prefer_SSE_for_memop (1 << 3)
666
667 #ifdef __ASSEMBLER__
668 @@ -34,6 +35,7 @@
669 # define index_SSE4_2 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
670
671 #define index_Fast_Rep_String FEATURE_INDEX_1*FEATURE_SIZE
672 +# define index_Slow_BSF FEATURE_INDEX_1*FEATURE_SIZE
673 # define index_Prefer_SSE_for_memop FEATURE_INDEX_1*FEATURE_SIZE
674
675 #else /* __ASSEMBLER__ */
676 @@ -105,11 +107,15 @@ extern const struct cpu_features *__get_
677 # define HAS_FMA HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, 12)
678
679 # define index_Fast_Rep_String FEATURE_INDEX_1
680 +# define index_Slow_BSF FEATURE_INDEX_1
681 # define index_Prefer_SSE_for_memop FEATURE_INDEX_1
682
683 #define HAS_ARCH_FEATURE(idx, bit) \
684 ((__get_cpu_features ()->feature[idx] & (bit)) != 0)
685
686 +#define HAS_SLOW_BSF \
687 + HAS_ARCH_FEATURE (index_Slow_BSF, bit_Slow_BSF)
688 +
689 #define HAS_PREFER_SSE_FOR_MEMOP \
690 HAS_ARCH_FEATURE (index_Prefer_SSE_for_memop, bit_Prefer_SSE_for_memop)
691
692 Index: glibc-2.12-2-gc4ccff1/sysdeps/x86_64/multiarch/strlen-no-bsf.S
693 ===================================================================
694 --- /dev/null
695 +++ glibc-2.12-2-gc4ccff1/sysdeps/x86_64/multiarch/strlen-no-bsf.S
696 @@ -0,0 +1,309 @@
697 +/* strlen without BSF
698 + Copyright (C) 2010 Free Software Foundation, Inc.
699 + Contributed by Intel Corporation.
700 + This file is part of the GNU C Library.
701 +
702 + The GNU C Library is free software; you can redistribute it and/or
703 + modify it under the terms of the GNU Lesser General Public
704 + License as published by the Free Software Foundation; either
705 + version 2.1 of the License, or (at your option) any later version.
706 +
707 + The GNU C Library is distributed in the hope that it will be useful,
708 + but WITHOUT ANY WARRANTY; without even the implied warranty of
709 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
710 + Lesser General Public License for more details.
711 +
712 + You should have received a copy of the GNU Lesser General Public
713 + License along with the GNU C Library; if not, write to the Free
714 + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
715 + 02111-1307 USA. */
716 +
717 +#if defined SHARED && !defined NOT_IN_libc
718 +
719 +#include <sysdep.h>
720 +
721 + .section .text.slow,"ax",@progbits
722 +ENTRY (__strlen_no_bsf)
723 + xor %eax, %eax
724 + cmpb $0, (%rdi)
725 + jz L(exit_tail0)
726 + cmpb $0, 1(%rdi)
727 + jz L(exit_tail1)
728 + cmpb $0, 2(%rdi)
729 + jz L(exit_tail2)
730 + cmpb $0, 3(%rdi)
731 + jz L(exit_tail3)
732 + cmpb $0, 4(%rdi)
733 + jz L(exit_tail4)
734 + cmpb $0, 5(%rdi)
735 + jz L(exit_tail5)
736 + cmpb $0, 6(%rdi)
737 + jz L(exit_tail6)
738 + cmpb $0, 7(%rdi)
739 + jz L(exit_tail7)
740 + cmpb $0, 8(%rdi)
741 + jz L(exit_tail8)
742 + cmpb $0, 9(%rdi)
743 + jz L(exit_tail9)
744 + cmpb $0, 10(%rdi)
745 + jz L(exit_tail10)
746 + cmpb $0, 11(%rdi)
747 + jz L(exit_tail11)
748 + cmpb $0, 12(%rdi)
749 + jz L(exit_tail12)
750 + cmpb $0, 13(%rdi)
751 + jz L(exit_tail13)
752 + cmpb $0, 14(%rdi)
753 + jz L(exit_tail14)
754 + cmpb $0, 15(%rdi)
755 + jz L(exit_tail15)
756 + pxor %xmm0, %xmm0
757 + mov %rdi, %rcx
758 + mov %rdi, %rax
759 + and $-16, %rax
760 + add $16, %rax
761 + add $16, %rcx
762 +
763 + pcmpeqb (%rax), %xmm0
764 + pmovmskb %xmm0, %edx
765 + pxor %xmm1, %xmm1
766 + test %edx, %edx
767 + lea 16(%rax), %rax
768 + jnz L(exit)
769 +
770 + pcmpeqb (%rax), %xmm1
771 + pmovmskb %xmm1, %edx
772 + pxor %xmm2, %xmm2
773 + test %edx, %edx
774 + lea 16(%rax), %rax
775 + jnz L(exit)
776 +
777 +
778 + pcmpeqb (%rax), %xmm2
779 + pmovmskb %xmm2, %edx
780 + pxor %xmm3, %xmm3
781 + test %edx, %edx
782 + lea 16(%rax), %rax
783 + jnz L(exit)
784 +
785 + pcmpeqb (%rax), %xmm3
786 + pmovmskb %xmm3, %edx
787 + test %edx, %edx
788 + lea 16(%rax), %rax
789 + jnz L(exit)
790 +
791 + pcmpeqb (%rax), %xmm0
792 + pmovmskb %xmm0, %edx
793 + test %edx, %edx
794 + lea 16(%rax), %rax
795 + jnz L(exit)
796 +
797 + pcmpeqb (%rax), %xmm1
798 + pmovmskb %xmm1, %edx
799 + test %edx, %edx
800 + lea 16(%rax), %rax
801 + jnz L(exit)
802 +
803 + pcmpeqb (%rax), %xmm2
804 + pmovmskb %xmm2, %edx
805 + test %edx, %edx
806 + lea 16(%rax), %rax
807 + jnz L(exit)
808 +
809 + pcmpeqb (%rax), %xmm3
810 + pmovmskb %xmm3, %edx
811 + test %edx, %edx
812 + lea 16(%rax), %rax
813 + jnz L(exit)
814 +
815 + pcmpeqb (%rax), %xmm0
816 + pmovmskb %xmm0, %edx
817 + test %edx, %edx
818 + lea 16(%rax), %rax
819 + jnz L(exit)
820 +
821 + pcmpeqb (%rax), %xmm1
822 + pmovmskb %xmm1, %edx
823 + test %edx, %edx
824 + lea 16(%rax), %rax
825 + jnz L(exit)
826 +
827 + pcmpeqb (%rax), %xmm2
828 + pmovmskb %xmm2, %edx
829 + test %edx, %edx
830 + lea 16(%rax), %rax
831 + jnz L(exit)
832 +
833 + pcmpeqb (%rax), %xmm3
834 + pmovmskb %xmm3, %edx
835 + test %edx, %edx
836 + lea 16(%rax), %rax
837 + jnz L(exit)
838 +
839 + pcmpeqb (%rax), %xmm0
840 + pmovmskb %xmm0, %edx
841 + test %edx, %edx
842 + lea 16(%rax), %rax
843 + jnz L(exit)
844 +
845 + pcmpeqb (%rax), %xmm1
846 + pmovmskb %xmm1, %edx
847 + test %edx, %edx
848 + lea 16(%rax), %rax
849 + jnz L(exit)
850 +
851 + pcmpeqb (%rax), %xmm2
852 + pmovmskb %xmm2, %edx
853 + test %edx, %edx
854 + lea 16(%rax), %rax
855 + jnz L(exit)
856 +
857 + pcmpeqb (%rax), %xmm3
858 + pmovmskb %xmm3, %edx
859 + test %edx, %edx
860 + lea 16(%rax), %rax
861 + jnz L(exit)
862 +
863 + and $-0x40, %rax
864 + xor %r8d, %r8d
865 +L(aligned_64):
866 + pcmpeqb (%rax), %xmm0
867 + pcmpeqb 16(%rax), %xmm1
868 + pcmpeqb 32(%rax), %xmm2
869 + pcmpeqb 48(%rax), %xmm3
870 + pmovmskb %xmm0, %edx
871 + pmovmskb %xmm1, %esi
872 + pmovmskb %xmm2, %edi
873 + pmovmskb %xmm3, %r9d
874 + or %edx, %r8d
875 + or %esi, %r8d
876 + or %edi, %r8d
877 + or %r9d, %r8d
878 + lea 64(%rax), %rax
879 + jz L(aligned_64)
880 +
881 + test %edx, %edx
882 + jnz L(aligned_64_exit_16)
883 + test %esi, %esi
884 + jnz L(aligned_64_exit_32)
885 + test %edi, %edi
886 + jnz L(aligned_64_exit_48)
887 +L(aligned_64_exit_64):
888 + mov %r9d, %edx
889 + jmp L(aligned_64_exit)
890 +L(aligned_64_exit_48):
891 + lea -16(%rax), %rax
892 + mov %edi, %edx
893 + jmp L(aligned_64_exit)
894 +L(aligned_64_exit_32):
895 + lea -32(%rax), %rax
896 + mov %esi, %edx
897 + jmp L(aligned_64_exit)
898 +L(aligned_64_exit_16):
899 + lea -48(%rax), %rax
900 +L(aligned_64_exit):
901 +L(exit):
902 + sub %rcx, %rax
903 + test %dl, %dl
904 + jz L(exit_high)
905 + test $0x01, %dl
906 + jnz L(exit_tail0)
907 +
908 + test $0x02, %dl
909 + jnz L(exit_tail1)
910 +
911 + test $0x04, %dl
912 + jnz L(exit_tail2)
913 +
914 + test $0x08, %dl
915 + jnz L(exit_tail3)
916 +
917 + test $0x10, %dl
918 + jnz L(exit_tail4)
919 +
920 + test $0x20, %dl
921 + jnz L(exit_tail5)
922 +
923 + test $0x40, %dl
924 + jnz L(exit_tail6)
925 + add $7, %eax
926 +L(exit_tail0):
927 + ret
928 +
929 +L(exit_high):
930 + add $8, %eax
931 + test $0x01, %dh
932 + jnz L(exit_tail0)
933 +
934 + test $0x02, %dh
935 + jnz L(exit_tail1)
936 +
937 + test $0x04, %dh
938 + jnz L(exit_tail2)
939 +
940 + test $0x08, %dh
941 + jnz L(exit_tail3)
942 +
943 + test $0x10, %dh
944 + jnz L(exit_tail4)
945 +
946 + test $0x20, %dh
947 + jnz L(exit_tail5)
948 +
949 + test $0x40, %dh
950 + jnz L(exit_tail6)
951 + add $7, %eax
952 + ret
953 + .p2align 4
954 +L(exit_tail1):
955 + add $1, %eax
956 + ret
957 +
958 +L(exit_tail2):
959 + add $2, %eax
960 + ret
961 +
962 +L(exit_tail3):
963 + add $3, %eax
964 + ret
965 +
966 +L(exit_tail4):
967 + add $4, %eax
968 + ret
969 +
970 +L(exit_tail5):
971 + add $5, %eax
972 + ret
973 +L(exit_tail6):
974 + add $6, %eax
975 + ret
976 +L(exit_tail7):
977 + add $7, %eax
978 + ret
979 +L(exit_tail8):
980 + add $8, %eax
981 + ret
982 +L(exit_tail9):
983 + add $9, %eax
984 + ret
985 +L(exit_tail10):
986 + add $10, %eax
987 + ret
988 +L(exit_tail11):
989 + add $11, %eax
990 + ret
991 +L(exit_tail12):
992 + add $12, %eax
993 + ret
994 +L(exit_tail13):
995 + add $13, %eax
996 + ret
997 +L(exit_tail14):
998 + add $14, %eax
999 + ret
1000 +L(exit_tail15):
1001 + add $15, %eax
1002 + ret
1003 +END (__strlen_no_bsf)
1004 +
1005 +#endif
1006 Index: glibc-2.12-2-gc4ccff1/sysdeps/x86_64/multiarch/strlen-sse4.S
1007 ===================================================================
1008 --- /dev/null
1009 +++ glibc-2.12-2-gc4ccff1/sysdeps/x86_64/multiarch/strlen-sse4.S
1010 @@ -0,0 +1,85 @@
1011 +/* strlen with SSE4
1012 + Copyright (C) 2009, 2010 Free Software Foundation, Inc.
1013 + Contributed by Ulrich Drepper <drepper@redhat.com>.
1014 + This file is part of the GNU C Library.
1015 +
1016 + The GNU C Library is free software; you can redistribute it and/or
1017 + modify it under the terms of the GNU Lesser General Public
1018 + License as published by the Free Software Foundation; either
1019 + version 2.1 of the License, or (at your option) any later version.
1020 +
1021 + The GNU C Library is distributed in the hope that it will be useful,
1022 + but WITHOUT ANY WARRANTY; without even the implied warranty of
1023 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
1024 + Lesser General Public License for more details.
1025 +
1026 + You should have received a copy of the GNU Lesser General Public
1027 + License along with the GNU C Library; if not, write to the Free
1028 + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
1029 + 02111-1307 USA. */
1030 +
1031 +#if defined SHARED && !defined NOT_IN_libc
1032 +
1033 +#include <sysdep.h>
1034 +
1035 + .section .text.sse4.2,"ax",@progbits
1036 +ENTRY (__strlen_sse42)
1037 + pxor %xmm1, %xmm1
1038 + movl %edi, %ecx
1039 + movq %rdi, %r8
1040 + andq $~15, %rdi
1041 + xor %edi, %ecx
1042 + pcmpeqb (%rdi), %xmm1
1043 + pmovmskb %xmm1, %edx
1044 + shrl %cl, %edx
1045 + shll %cl, %edx
1046 + andl %edx, %edx
1047 + jnz L(less16bytes)
1048 + pxor %xmm1, %xmm1
1049 +
1050 + .p2align 4
1051 +L(more64bytes_loop):
1052 + pcmpistri $0x08, 16(%rdi), %xmm1
1053 + jz L(more32bytes)
1054 +
1055 + pcmpistri $0x08, 32(%rdi), %xmm1
1056 + jz L(more48bytes)
1057 +
1058 + pcmpistri $0x08, 48(%rdi), %xmm1
1059 + jz L(more64bytes)
1060 +
1061 + add $64, %rdi
1062 + pcmpistri $0x08, (%rdi), %xmm1
1063 + jnz L(more64bytes_loop)
1064 + leaq (%rdi,%rcx), %rax
1065 + subq %r8, %rax
1066 + ret
1067 +
1068 + .p2align 4
1069 +L(more32bytes):
1070 + leaq 16(%rdi,%rcx, 1), %rax
1071 + subq %r8, %rax
1072 + ret
1073 +
1074 + .p2align 4
1075 +L(more48bytes):
1076 + leaq 32(%rdi,%rcx, 1), %rax
1077 + subq %r8, %rax
1078 + ret
1079 +
1080 + .p2align 4
1081 +L(more64bytes):
1082 + leaq 48(%rdi,%rcx, 1), %rax
1083 + subq %r8, %rax
1084 + ret
1085 +
1086 + .p2align 4
1087 +L(less16bytes):
1088 + subq %r8, %rdi
1089 + bsfl %edx, %eax
1090 + addq %rdi, %rax
1091 + ret
1092 +
1093 +END (__strlen_sse42)
1094 +
1095 +#endif
1096 Index: glibc-2.12-2-gc4ccff1/sysdeps/x86_64/multiarch/strlen.S
1097 ===================================================================
1098 --- glibc-2.12-2-gc4ccff1.orig/sysdeps/x86_64/multiarch/strlen.S
1099 +++ glibc-2.12-2-gc4ccff1/sysdeps/x86_64/multiarch/strlen.S
1100 @@ -36,74 +36,12 @@ ENTRY(strlen)
1101 testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
1102 jz 2f
1103 leaq __strlen_sse42(%rip), %rax
1104 -2: ret
1105 -END(strlen)
1106 -
1107 -
1108 - .section .text.sse4.2,"ax",@progbits
1109 - .align 16
1110 - .type __strlen_sse42, @function
1111 -__strlen_sse42:
1112 - cfi_startproc
1113 - CALL_MCOUNT
1114 - pxor %xmm1, %xmm1
1115 - movl %edi, %ecx
1116 - movq %rdi, %r8
1117 - andq $~15, %rdi
1118 - xor %edi, %ecx
1119 - pcmpeqb (%rdi), %xmm1
1120 - pmovmskb %xmm1, %edx
1121 - shrl %cl, %edx
1122 - shll %cl, %edx
1123 - andl %edx, %edx
1124 - jnz L(less16bytes)
1125 - pxor %xmm1, %xmm1
1126 -
1127 - .p2align 4
1128 -L(more64bytes_loop):
1129 - pcmpistri $0x08, 16(%rdi), %xmm1
1130 - jz L(more32bytes)
1131 -
1132 - pcmpistri $0x08, 32(%rdi), %xmm1
1133 - jz L(more48bytes)
1134 -
1135 - pcmpistri $0x08, 48(%rdi), %xmm1
1136 - jz L(more64bytes)
1137 -
1138 - add $64, %rdi
1139 - pcmpistri $0x08, (%rdi), %xmm1
1140 - jnz L(more64bytes_loop)
1141 - leaq (%rdi,%rcx), %rax
1142 - subq %r8, %rax
1143 - ret
1144 -
1145 - .p2align 4
1146 -L(more32bytes):
1147 - leaq 16(%rdi,%rcx, 1), %rax
1148 - subq %r8, %rax
1149 - ret
1150 -
1151 - .p2align 4
1152 -L(more48bytes):
1153 - leaq 32(%rdi,%rcx, 1), %rax
1154 - subq %r8, %rax
1155 - ret
1156 -
1157 - .p2align 4
1158 -L(more64bytes):
1159 - leaq 48(%rdi,%rcx, 1), %rax
1160 - subq %r8, %rax
1161 ret
1162 -
1163 - .p2align 4
1164 -L(less16bytes):
1165 - subq %r8, %rdi
1166 - bsfl %edx, %eax
1167 - addq %rdi, %rax
1168 - ret
1169 - cfi_endproc
1170 - .size __strlen_sse42, .-__strlen_sse42
1171 -
1172 +2: testl $bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip)
1173 + jz 3f
1174 + leaq __strlen_no_bsf(%rip), %rax
1175 +3: ret
1176 +END(strlen)
1177
1178 # undef ENTRY
1179 # define ENTRY(name) \
1180 Index: glibc-2.12-2-gc4ccff1/sysdeps/x86_64/strlen.S
1181 ===================================================================
1182 --- glibc-2.12-2-gc4ccff1.orig/sysdeps/x86_64/strlen.S
1183 +++ glibc-2.12-2-gc4ccff1/sysdeps/x86_64/strlen.S
1184 @@ -23,29 +23,80 @@
1185
1186 .text
1187 ENTRY(strlen)
1188 - pxor %xmm2, %xmm2
1189 - movq %rdi, %rcx
1190 - movq %rdi, %r8
1191 - andq $~15, %rdi
1192 - movdqa %xmm2, %xmm1
1193 - pcmpeqb (%rdi), %xmm2
1194 - orl $0xffffffff, %esi
1195 - subq %rdi, %rcx
1196 - shll %cl, %esi
1197 - pmovmskb %xmm2, %edx
1198 - andl %esi, %edx
1199 - jnz 1f
1200 -
1201 -2: movdqa 16(%rdi), %xmm0
1202 - leaq 16(%rdi), %rdi
1203 + xor %rax, %rax
1204 + mov %edi, %ecx
1205 + and $0x3f, %ecx
1206 + pxor %xmm0, %xmm0
1207 + cmp $0x30, %ecx
1208 + ja L(next)
1209 + movdqu (%rdi), %xmm1
1210 pcmpeqb %xmm1, %xmm0
1211 pmovmskb %xmm0, %edx
1212 - testl %edx, %edx
1213 - jz 2b
1214 + test %edx, %edx
1215 + jnz L(exit_less16)
1216 + mov %rdi, %rax
1217 + and $-16, %rax
1218 + jmp L(align16_start)
1219 +L(next):
1220 + mov %rdi, %rax
1221 + and $-16, %rax
1222 + pcmpeqb (%rax), %xmm0
1223 + mov $-1, %esi
1224 + sub %rax, %rcx
1225 + shl %cl, %esi
1226 + pmovmskb %xmm0, %edx
1227 + and %esi, %edx
1228 + jnz L(exit)
1229 +L(align16_start):
1230 + pxor %xmm0, %xmm0
1231 + pxor %xmm1, %xmm1
1232 + pxor %xmm2, %xmm2
1233 + pxor %xmm3, %xmm3
1234 + .p2align 4
1235 +L(align16_loop):
1236 + pcmpeqb 16(%rax), %xmm0
1237 + pmovmskb %xmm0, %edx
1238 + test %edx, %edx
1239 + jnz L(exit16)
1240
1241 -1: subq %r8, %rdi
1242 - bsfl %edx, %eax
1243 - addq %rdi, %rax
1244 + pcmpeqb 32(%rax), %xmm1
1245 + pmovmskb %xmm1, %edx
1246 + test %edx, %edx
1247 + jnz L(exit32)
1248 +
1249 + pcmpeqb 48(%rax), %xmm2
1250 + pmovmskb %xmm2, %edx
1251 + test %edx, %edx
1252 + jnz L(exit48)
1253 +
1254 + pcmpeqb 64(%rax), %xmm3
1255 + pmovmskb %xmm3, %edx
1256 + lea 64(%rax), %rax
1257 + test %edx, %edx
1258 + jz L(align16_loop)
1259 +L(exit):
1260 + sub %rdi, %rax
1261 +L(exit_less16):
1262 + bsf %rdx, %rdx
1263 + add %rdx, %rax
1264 + ret
1265 + .p2align 4
1266 +L(exit16):
1267 + sub %rdi, %rax
1268 + bsf %rdx, %rdx
1269 + lea 16(%rdx,%rax), %rax
1270 + ret
1271 + .p2align 4
1272 +L(exit32):
1273 + sub %rdi, %rax
1274 + bsf %rdx, %rdx
1275 + lea 32(%rdx,%rax), %rax
1276 + ret
1277 + .p2align 4
1278 +L(exit48):
1279 + sub %rdi, %rax
1280 + bsf %rdx, %rdx
1281 + lea 48(%rdx,%rax), %rax
1282 ret
1283 END(strlen)
1284 libc_hidden_builtin_def (strlen)