2010-08-27  Ulrich Drepper  <drepper@redhat.com>

 * sysdeps/x86_64/multiarch/strlen-no-bsf.S: Move to .text.slow section.

 * sysdeps/x86_64/strlen.S: Minimal code improvement.

2010-08-26  H.J. Lu  <hongjiu.lu@intel.com>

 * sysdeps/x86_64/strlen.S: Unroll the loop.
 * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
 strlen-sse2 strlen-sse2-bsf.
 * sysdeps/x86_64/multiarch/strlen.S (strlen): Return
 __strlen_no_bsf if bit_Slow_BSF is set.
 (__strlen_sse42): Removed.
 * sysdeps/x86_64/multiarch/strlen-no-bsf.S: New file.
 * sysdeps/x86_64/multiarch/strlen-sse4.S: New file.

2010-08-25  H.J. Lu  <hongjiu.lu@intel.com>

 * sysdeps/i386/i686/multiarch/Makefile (sysdep_routines): Add
 strlen-sse2 strlen-sse2-bsf.
 * sysdeps/i386/i686/multiarch/strlen.S (strlen): Return
 __strlen_sse2_bsf if bit_Slow_BSF is unset.
 (__strlen_sse2): Removed.
 * sysdeps/i386/i686/multiarch/strlen-sse2-bsf.S: New file.
 * sysdeps/i386/i686/multiarch/strlen-sse2.S: New file.
 * sysdeps/x86_64/multiarch/init-arch.c (__init_cpu_features): Set
 bit_Slow_BSF for Atom.
 * sysdeps/x86_64/multiarch/init-arch.h (bit_Slow_BSF): Define.
 (index_Slow_BSF): Define.
 (HAS_SLOW_BSF): Define.

Index: glibc-2.12-2-gc4ccff1/sysdeps/i386/i686/multiarch/Makefile
===================================================================
--- glibc-2.12-2-gc4ccff1.orig/sysdeps/i386/i686/multiarch/Makefile
+++ glibc-2.12-2-gc4ccff1/sysdeps/i386/i686/multiarch/Makefile
@@ -9,7 +9,8 @@ sysdep_routines += bzero-sse2 memset-sse
 memmove-ssse3-rep bcopy-ssse3 bcopy-ssse3-rep \
 memset-sse2-rep bzero-sse2-rep strcmp-ssse3 \
 strcmp-sse4 strncmp-c strncmp-ssse3 strncmp-sse4 \
- memcmp-ssse3 memcmp-sse4 strcasestr-nonascii
+ memcmp-ssse3 memcmp-sse4 strcasestr-nonascii \
+ strlen-sse2 strlen-sse2-bsf
 ifeq (yes,$(config-cflags-sse4))
 sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c
 CFLAGS-strcspn-c.c += -msse4
Index: glibc-2.12-2-gc4ccff1/sysdeps/i386/i686/multiarch/strlen-sse2-bsf.S
===================================================================
--- /dev/null
+++ glibc-2.12-2-gc4ccff1/sysdeps/i386/i686/multiarch/strlen-sse2-bsf.S
@@ -0,0 +1,127 @@
+/* strlen with SSE2 and BSF
+ Copyright (C) 2010 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#if defined SHARED && !defined NOT_IN_libc
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+#define PUSH(REG) pushl REG; CFI_PUSH (REG)
+#define POP(REG) popl REG; CFI_POP (REG)
+#define PARMS 4 + 8 /* Preserve ESI and EDI. */
+#define STR PARMS
+#define ENTRANCE PUSH (%esi); PUSH (%edi); cfi_remember_state
+#define RETURN POP (%edi); POP (%esi); ret; \
+ cfi_restore_state; cfi_remember_state
+
+ .text
+ENTRY ( __strlen_sse2_bsf)
+ ENTRANCE
+ mov STR(%esp), %edi
+ xor %eax, %eax
+ mov %edi, %ecx
+ and $0x3f, %ecx
+ pxor %xmm0, %xmm0
+ cmp $0x30, %ecx
+ ja L(next)
+ movdqu (%edi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz L(exit_less16)
+ mov %edi, %eax
+ and $-16, %eax
+ jmp L(align16_start)
+L(next):
+
+ mov %edi, %eax
+ and $-16, %eax
+ pcmpeqb (%eax), %xmm0
+ mov $-1, %esi
+ sub %eax, %ecx
+ shl %cl, %esi
+ pmovmskb %xmm0, %edx
+ and %esi, %edx
+ jnz L(exit)
+L(align16_start):
+ pxor %xmm0, %xmm0
+ pxor %xmm1, %xmm1
+ pxor %xmm2, %xmm2
+ pxor %xmm3, %xmm3
+ .p2align 4
+L(align16_loop):
+ pcmpeqb 16(%eax), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz L(exit16)
+
+ pcmpeqb 32(%eax), %xmm1
+ pmovmskb %xmm1, %edx
+ test %edx, %edx
+ jnz L(exit32)
+
+ pcmpeqb 48(%eax), %xmm2
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ jnz L(exit48)
+
+ pcmpeqb 64(%eax), %xmm3
+ pmovmskb %xmm3, %edx
+ lea 64(%eax), %eax
+ test %edx, %edx
+ jz L(align16_loop)
+L(exit):
+ sub %edi, %eax
+L(exit_less16):
+ bsf %edx, %edx
+ add %edx, %eax
+ RETURN
+L(exit16):
+ sub %edi, %eax
+ bsf %edx, %edx
+ add %edx, %eax
+ add $16, %eax
+ RETURN
+L(exit32):
+ sub %edi, %eax
+ bsf %edx, %edx
+ add %edx, %eax
+ add $32, %eax
+ RETURN
+L(exit48):
+ sub %edi, %eax
+ bsf %edx, %edx
+ add %edx, %eax
+ add $48, %eax
+ POP (%edi)
+ POP (%esi)
+ ret
+
+END ( __strlen_sse2_bsf)
+
+#endif
Index: glibc-2.12-2-gc4ccff1/sysdeps/i386/i686/multiarch/strlen-sse2.S
===================================================================
--- /dev/null
+++ glibc-2.12-2-gc4ccff1/sysdeps/i386/i686/multiarch/strlen-sse2.S
@@ -0,0 +1,347 @@
+/* strlen with SSE2
+ Copyright (C) 2010 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#if defined SHARED && !defined NOT_IN_libc
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+#define PUSH(REG) pushl REG; CFI_PUSH (REG)
+#define POP(REG) popl REG; CFI_POP (REG)
+#define PARMS 4
+#define STR PARMS
+#define ENTRANCE
+#define RETURN ret
+
+ .text
+ENTRY (__strlen_sse2)
+ ENTRANCE
+ mov STR(%esp), %edx
+ xor %eax, %eax
+ cmpb $0, (%edx)
+ jz L(exit_tail0)
+ cmpb $0, 1(%edx)
+ jz L(exit_tail1)
+ cmpb $0, 2(%edx)
+ jz L(exit_tail2)
+ cmpb $0, 3(%edx)
+ jz L(exit_tail3)
+ cmpb $0, 4(%edx)
+ jz L(exit_tail4)
+ cmpb $0, 5(%edx)
+ jz L(exit_tail5)
+ cmpb $0, 6(%edx)
+ jz L(exit_tail6)
+ cmpb $0, 7(%edx)
+ jz L(exit_tail7)
+ cmpb $0, 8(%edx)
+ jz L(exit_tail8)
+ cmpb $0, 9(%edx)
+ jz L(exit_tail9)
+ cmpb $0, 10(%edx)
+ jz L(exit_tail10)
+ cmpb $0, 11(%edx)
+ jz L(exit_tail11)
+ cmpb $0, 12(%edx)
+ jz L(exit_tail12)
+ cmpb $0, 13(%edx)
+ jz L(exit_tail13)
+ cmpb $0, 14(%edx)
+ jz L(exit_tail14)
+ cmpb $0, 15(%edx)
+ jz L(exit_tail15)
+ pxor %xmm0, %xmm0
+ mov %edx, %eax
+ mov %edx, %ecx
+ and $-16, %eax
+ add $16, %ecx
+ add $16, %eax
+
+ pcmpeqb (%eax), %xmm0
+ pmovmskb %xmm0, %edx
+ pxor %xmm1, %xmm1
+ test %edx, %edx
+ lea 16(%eax), %eax
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm1
+ pmovmskb %xmm1, %edx
+ pxor %xmm2, %xmm2
+ test %edx, %edx
+ lea 16(%eax), %eax
+ jnz L(exit)
+
+
+ pcmpeqb (%eax), %xmm2
+ pmovmskb %xmm2, %edx
+ pxor %xmm3, %xmm3
+ test %edx, %edx
+ lea 16(%eax), %eax
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ lea 16(%eax), %eax
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ lea 16(%eax), %eax
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm1
+ pmovmskb %xmm1, %edx
+ test %edx, %edx
+ lea 16(%eax), %eax
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm2
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ lea 16(%eax), %eax
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ lea 16(%eax), %eax
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ lea 16(%eax), %eax
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm1
+ pmovmskb %xmm1, %edx
+ test %edx, %edx
+ lea 16(%eax), %eax
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm2
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ lea 16(%eax), %eax
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ lea 16(%eax), %eax
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ lea 16(%eax), %eax
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm1
+ pmovmskb %xmm1, %edx
+ test %edx, %edx
+ lea 16(%eax), %eax
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm2
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ lea 16(%eax), %eax
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ lea 16(%eax), %eax
+ jnz L(exit)
+
+ and $-0x40, %eax
+ PUSH (%esi)
+ PUSH (%edi)
+ PUSH (%ebx)
+ PUSH (%ebp)
+ xor %ebp, %ebp
+L(aligned_64):
+ pcmpeqb (%eax), %xmm0
+ pcmpeqb 16(%eax), %xmm1
+ pcmpeqb 32(%eax), %xmm2
+ pcmpeqb 48(%eax), %xmm3
+ pmovmskb %xmm0, %edx
+ pmovmskb %xmm1, %esi
+ pmovmskb %xmm2, %edi
+ pmovmskb %xmm3, %ebx
+ or %edx, %ebp
+ or %esi, %ebp
+ or %edi, %ebp
+ or %ebx, %ebp
+ lea 64(%eax), %eax
+ jz L(aligned_64)
+L(48leave):
+ test %edx, %edx
+ jnz L(aligned_64_exit_16)
+ test %esi, %esi
+ jnz L(aligned_64_exit_32)
+ test %edi, %edi
+ jnz L(aligned_64_exit_48)
+ mov %ebx, %edx
+ lea (%eax), %eax
+ jmp L(aligned_64_exit)
+L(aligned_64_exit_48):
+ lea -16(%eax), %eax
+ mov %edi, %edx
+ jmp L(aligned_64_exit)
+L(aligned_64_exit_32):
+ lea -32(%eax), %eax
+ mov %esi, %edx
+ jmp L(aligned_64_exit)
+L(aligned_64_exit_16):
+ lea -48(%eax), %eax
+L(aligned_64_exit):
+ POP (%ebp)
+ POP (%ebx)
+ POP (%edi)
+ POP (%esi)
+L(exit):
+ sub %ecx, %eax
+ test %dl, %dl
+ jz L(exit_high)
+ test $0x01, %dl
+ jnz L(exit_tail0)
+
+ test $0x02, %dl
+ jnz L(exit_tail1)
+
+ test $0x04, %dl
+ jnz L(exit_tail2)
+
+ test $0x08, %dl
+ jnz L(exit_tail3)
+
+ test $0x10, %dl
+ jnz L(exit_tail4)
+
+ test $0x20, %dl
+ jnz L(exit_tail5)
+
+ test $0x40, %dl
+ jnz L(exit_tail6)
+ add $7, %eax
+L(exit_tail0):
+ RETURN
+
+L(exit_high):
+ add $8, %eax
+ test $0x01, %dh
+ jnz L(exit_tail0)
+
+ test $0x02, %dh
+ jnz L(exit_tail1)
+
+ test $0x04, %dh
+ jnz L(exit_tail2)
+
+ test $0x08, %dh
+ jnz L(exit_tail3)
+
+ test $0x10, %dh
+ jnz L(exit_tail4)
+
+ test $0x20, %dh
+ jnz L(exit_tail5)
+
+ test $0x40, %dh
+ jnz L(exit_tail6)
+ add $7, %eax
+ RETURN
+
+ .p2align 4
+L(exit_tail1):
+ add $1, %eax
+ RETURN
+
+L(exit_tail2):
+ add $2, %eax
+ RETURN
+
+L(exit_tail3):
+ add $3, %eax
+ RETURN
+
+L(exit_tail4):
+ add $4, %eax
+ RETURN
+
+L(exit_tail5):
+ add $5, %eax
+ RETURN
+
+L(exit_tail6):
+ add $6, %eax
+ RETURN
+
+L(exit_tail7):
+ add $7, %eax
+ RETURN
+
+L(exit_tail8):
+ add $8, %eax
+ RETURN
+
+L(exit_tail9):
+ add $9, %eax
+ RETURN
+
+L(exit_tail10):
+ add $10, %eax
+ RETURN
+
+L(exit_tail11):
+ add $11, %eax
+ RETURN
+
+L(exit_tail12):
+ add $12, %eax
+ RETURN
+
+L(exit_tail13):
+ add $13, %eax
+ RETURN
+
+L(exit_tail14):
+ add $14, %eax
+ RETURN
+
+L(exit_tail15):
+ add $15, %eax
+ ret
+
+END (__strlen_sse2)
+
+#endif
Index: glibc-2.12-2-gc4ccff1/sysdeps/i386/i686/multiarch/strlen.S
===================================================================
--- glibc-2.12-2-gc4ccff1.orig/sysdeps/i386/i686/multiarch/strlen.S
+++ glibc-2.12-2-gc4ccff1/sysdeps/i386/i686/multiarch/strlen.S
@@ -48,6 +48,9 @@ ENTRY(strlen)
 1: leal __strlen_ia32@GOTOFF(%ebx), %eax
 testl $bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
 jz 2f
+ leal __strlen_sse2_bsf@GOTOFF(%ebx), %eax
+ testl $bit_Slow_BSF, FEATURE_OFFSET+index_Slow_BSF+__cpu_features@GOTOFF(%ebx)
+ jz 2f
 leal __strlen_sse2@GOTOFF(%ebx), %eax
 2: popl %ebx
 cfi_adjust_cfa_offset (-4);
@@ -55,84 +58,6 @@ ENTRY(strlen)
 ret
 END(strlen)

-#define CFI_POP(REG) \
- cfi_adjust_cfa_offset (-4); \
- cfi_restore (REG)
-
-#define RETURN popl %esi; CFI_POP (esi); ret
-
- .text
-ENTRY (__strlen_sse2)
-/*
- * This implementation uses SSE instructions to compare up to 16 bytes
- * at a time looking for the end of string (null char).
- */
- pushl %esi
- cfi_adjust_cfa_offset (4)
- cfi_rel_offset (%esi, 0)
- mov 8(%esp), %eax
- mov %eax, %ecx
- pxor %xmm0, %xmm0 /* 16 null chars */
- mov %eax, %esi
- and $15, %ecx
- jz 1f /* string is 16 byte aligned */
-
- /*
- * Unaligned case. Round down to 16-byte boundary before comparing
- * 16 bytes for a null char. The code then compensates for any extra chars
- * preceding the start of the string.
- */
- and $-16, %esi
-
- pcmpeqb (%esi), %xmm0
- lea 16(%eax), %esi
- pmovmskb %xmm0, %edx
-
- shr %cl, %edx /* Compensate for bytes preceding the string */
- test %edx, %edx
- jnz 2f
- sub %ecx, %esi /* no null, adjust to next 16-byte boundary */
- pxor %xmm0, %xmm0 /* clear xmm0, may have been changed... */
-
- .p2align 4
-1: /* 16 byte aligned */
- pcmpeqb (%esi), %xmm0 /* look for null bytes */
- pmovmskb %xmm0, %edx /* move each byte mask of %xmm0 to edx */
-
- add $16, %esi /* prepare to search next 16 bytes */
- test %edx, %edx /* if no null byte, %edx must be 0 */
- jnz 2f /* found a null */
-
- pcmpeqb (%esi), %xmm0
- pmovmskb %xmm0, %edx
- add $16, %esi
- test %edx, %edx
- jnz 2f
-
- pcmpeqb (%esi), %xmm0
- pmovmskb %xmm0, %edx
- add $16, %esi
- test %edx, %edx
- jnz 2f
-
- pcmpeqb (%esi), %xmm0
- pmovmskb %xmm0, %edx
- add $16, %esi
- test %edx, %edx
- jz 1b
-
-2:
- neg %eax
- lea -16(%eax, %esi), %eax /* calculate exact offset */
- bsf %edx, %ecx /* Least significant 1 bit is index of null */
- add %ecx, %eax
- popl %esi
- cfi_adjust_cfa_offset (-4)
- cfi_restore (%esi)
- ret
-
-END (__strlen_sse2)
-
 # undef ENTRY
 # define ENTRY(name) \
 .type __strlen_ia32, @function; \
--- a/sysdeps/x86_64/multiarch/Makefile 2012-03-01 10:43:30.060487726 -0700
+++ b/sysdeps/x86_64/multiarch/Makefile 2012-03-01 10:45:57.894692115 -0700
@@ -7,7 +7,7 @@ ifeq ($(subdir),string)
 sysdep_routines += stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
 strend-sse4 memcmp-sse4 \
 strcasestr-nonascii strcasecmp_l-ssse3 \
- strncase_l-ssse3 \
+ strncase_l-ssse3 strlen-sse4 strlen-no-bsf \
 memset-x86-64
 ifeq (yes,$(config-cflags-sse4))
 sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c
Index: glibc-2.12-2-gc4ccff1/sysdeps/x86_64/multiarch/init-arch.c
===================================================================
--- glibc-2.12-2-gc4ccff1.orig/sysdeps/x86_64/multiarch/init-arch.c
+++ glibc-2.12-2-gc4ccff1/sysdeps/x86_64/multiarch/init-arch.c
@@ -77,6 +77,12 @@ __init_cpu_features (void)
 model += extended_model;
 switch (model)
 {
+ case 0x1c:
+ case 0x26:
+ /* BSF is slow on Atom. */
+ __cpu_features.feature[index_Slow_BSF] |= bit_Slow_BSF;
+ break;
+
 case 0x1a:
 case 0x1e:
 case 0x1f:
--- a/sysdeps/x86_64/multiarch/init-arch.h 2012-03-01 10:43:30.061487720 -0700
+++ b/sysdeps/x86_64/multiarch/init-arch.h 2012-03-01 10:48:13.371963005 -0700
@@ -17,6 +17,7 @@
 02111-1307 USA. */

 #define bit_Fast_Rep_String (1 << 0)
+#define bit_Slow_BSF (1 << 2)
 #define bit_Prefer_SSE_for_memop (1 << 3)

 #ifdef __ASSEMBLER__
@@ -34,6 +35,7 @@
 # define index_SSE4_2 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET

 #define index_Fast_Rep_String FEATURE_INDEX_1*FEATURE_SIZE
+# define index_Slow_BSF FEATURE_INDEX_1*FEATURE_SIZE
 # define index_Prefer_SSE_for_memop FEATURE_INDEX_1*FEATURE_SIZE

 #else /* __ASSEMBLER__ */
@@ -105,11 +107,15 @@ extern const struct cpu_features *__get_
 # define HAS_FMA HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, 12)

 # define index_Fast_Rep_String FEATURE_INDEX_1
+# define index_Slow_BSF FEATURE_INDEX_1
 # define index_Prefer_SSE_for_memop FEATURE_INDEX_1

 #define HAS_ARCH_FEATURE(idx, bit) \
 ((__get_cpu_features ()->feature[idx] & (bit)) != 0)

+#define HAS_SLOW_BSF \
+ HAS_ARCH_FEATURE (index_Slow_BSF, bit_Slow_BSF)
+
 #define HAS_PREFER_SSE_FOR_MEMOP \
 HAS_ARCH_FEATURE (index_Prefer_SSE_for_memop, bit_Prefer_SSE_for_memop)

Index: glibc-2.12-2-gc4ccff1/sysdeps/x86_64/multiarch/strlen-no-bsf.S
===================================================================
--- /dev/null
+++ glibc-2.12-2-gc4ccff1/sysdeps/x86_64/multiarch/strlen-no-bsf.S
@@ -0,0 +1,309 @@
+/* strlen without BSF
+ Copyright (C) 2010 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#if defined SHARED && !defined NOT_IN_libc
+
+#include <sysdep.h>
+
+ .section .text.slow,"ax",@progbits
+ENTRY (__strlen_no_bsf)
+ xor %eax, %eax
+ cmpb $0, (%rdi)
+ jz L(exit_tail0)
+ cmpb $0, 1(%rdi)
+ jz L(exit_tail1)
+ cmpb $0, 2(%rdi)
+ jz L(exit_tail2)
+ cmpb $0, 3(%rdi)
+ jz L(exit_tail3)
+ cmpb $0, 4(%rdi)
+ jz L(exit_tail4)
+ cmpb $0, 5(%rdi)
+ jz L(exit_tail5)
+ cmpb $0, 6(%rdi)
+ jz L(exit_tail6)
+ cmpb $0, 7(%rdi)
+ jz L(exit_tail7)
+ cmpb $0, 8(%rdi)
+ jz L(exit_tail8)
+ cmpb $0, 9(%rdi)
+ jz L(exit_tail9)
+ cmpb $0, 10(%rdi)
+ jz L(exit_tail10)
+ cmpb $0, 11(%rdi)
+ jz L(exit_tail11)
+ cmpb $0, 12(%rdi)
+ jz L(exit_tail12)
+ cmpb $0, 13(%rdi)
+ jz L(exit_tail13)
+ cmpb $0, 14(%rdi)
+ jz L(exit_tail14)
+ cmpb $0, 15(%rdi)
+ jz L(exit_tail15)
+ pxor %xmm0, %xmm0
+ mov %rdi, %rcx
+ mov %rdi, %rax
+ and $-16, %rax
+ add $16, %rax
+ add $16, %rcx
+
+ pcmpeqb (%rax), %xmm0
+ pmovmskb %xmm0, %edx
+ pxor %xmm1, %xmm1
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqb (%rax), %xmm1
+ pmovmskb %xmm1, %edx
+ pxor %xmm2, %xmm2
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+
+ pcmpeqb (%rax), %xmm2
+ pmovmskb %xmm2, %edx
+ pxor %xmm3, %xmm3
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqb (%rax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqb (%rax), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqb (%rax), %xmm1
+ pmovmskb %xmm1, %edx
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqb (%rax), %xmm2
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqb (%rax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqb (%rax), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqb (%rax), %xmm1
+ pmovmskb %xmm1, %edx
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqb (%rax), %xmm2
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqb (%rax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqb (%rax), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqb (%rax), %xmm1
+ pmovmskb %xmm1, %edx
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqb (%rax), %xmm2
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqb (%rax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ and $-0x40, %rax
+ xor %r8d, %r8d
+L(aligned_64):
+ pcmpeqb (%rax), %xmm0
+ pcmpeqb 16(%rax), %xmm1
+ pcmpeqb 32(%rax), %xmm2
+ pcmpeqb 48(%rax), %xmm3
+ pmovmskb %xmm0, %edx
+ pmovmskb %xmm1, %esi
+ pmovmskb %xmm2, %edi
+ pmovmskb %xmm3, %r9d
+ or %edx, %r8d
+ or %esi, %r8d
+ or %edi, %r8d
+ or %r9d, %r8d
+ lea 64(%rax), %rax
+ jz L(aligned_64)
+
+ test %edx, %edx
+ jnz L(aligned_64_exit_16)
+ test %esi, %esi
+ jnz L(aligned_64_exit_32)
+ test %edi, %edi
+ jnz L(aligned_64_exit_48)
+L(aligned_64_exit_64):
+ mov %r9d, %edx
+ jmp L(aligned_64_exit)
+L(aligned_64_exit_48):
+ lea -16(%rax), %rax
+ mov %edi, %edx
+ jmp L(aligned_64_exit)
+L(aligned_64_exit_32):
+ lea -32(%rax), %rax
+ mov %esi, %edx
+ jmp L(aligned_64_exit)
+L(aligned_64_exit_16):
+ lea -48(%rax), %rax
+L(aligned_64_exit):
+L(exit):
+ sub %rcx, %rax
+ test %dl, %dl
+ jz L(exit_high)
+ test $0x01, %dl
+ jnz L(exit_tail0)
+
+ test $0x02, %dl
+ jnz L(exit_tail1)
+
+ test $0x04, %dl
+ jnz L(exit_tail2)
+
+ test $0x08, %dl
+ jnz L(exit_tail3)
+
+ test $0x10, %dl
+ jnz L(exit_tail4)
+
+ test $0x20, %dl
+ jnz L(exit_tail5)
+
+ test $0x40, %dl
+ jnz L(exit_tail6)
+ add $7, %eax
+L(exit_tail0):
+ ret
+
+L(exit_high):
+ add $8, %eax
+ test $0x01, %dh
+ jnz L(exit_tail0)
+
+ test $0x02, %dh
+ jnz L(exit_tail1)
+
+ test $0x04, %dh
+ jnz L(exit_tail2)
+
+ test $0x08, %dh
+ jnz L(exit_tail3)
+
+ test $0x10, %dh
+ jnz L(exit_tail4)
+
+ test $0x20, %dh
+ jnz L(exit_tail5)
+
+ test $0x40, %dh
+ jnz L(exit_tail6)
+ add $7, %eax
+ ret
+ .p2align 4
+L(exit_tail1):
+ add $1, %eax
+ ret
+
+L(exit_tail2):
+ add $2, %eax
+ ret
+
+L(exit_tail3):
+ add $3, %eax
+ ret
+
+L(exit_tail4):
+ add $4, %eax
+ ret
+
+L(exit_tail5):
+ add $5, %eax
+ ret
+L(exit_tail6):
+ add $6, %eax
+ ret
+L(exit_tail7):
+ add $7, %eax
+ ret
+L(exit_tail8):
+ add $8, %eax
+ ret
+L(exit_tail9):
+ add $9, %eax
+ ret
+L(exit_tail10):
+ add $10, %eax
+ ret
+L(exit_tail11):
+ add $11, %eax
+ ret
+L(exit_tail12):
+ add $12, %eax
+ ret
+L(exit_tail13):
+ add $13, %eax
+ ret
+L(exit_tail14):
+ add $14, %eax
+ ret
+L(exit_tail15):
+ add $15, %eax
+ ret
+END (__strlen_no_bsf)
+
+#endif
Index: glibc-2.12-2-gc4ccff1/sysdeps/x86_64/multiarch/strlen-sse4.S
===================================================================
--- /dev/null
+++ glibc-2.12-2-gc4ccff1/sysdeps/x86_64/multiarch/strlen-sse4.S
@@ -0,0 +1,85 @@
+/* strlen with SSE4
+ Copyright (C) 2009, 2010 Free Software Foundation, Inc.
+ Contributed by Ulrich Drepper <drepper@redhat.com>.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#if defined SHARED && !defined NOT_IN_libc
+
+#include <sysdep.h>
+
+ .section .text.sse4.2,"ax",@progbits
+ENTRY (__strlen_sse42)
+ pxor %xmm1, %xmm1
+ movl %edi, %ecx
+ movq %rdi, %r8
+ andq $~15, %rdi
+ xor %edi, %ecx
+ pcmpeqb (%rdi), %xmm1
+ pmovmskb %xmm1, %edx
+ shrl %cl, %edx
+ shll %cl, %edx
+ andl %edx, %edx
+ jnz L(less16bytes)
+ pxor %xmm1, %xmm1
+
+ .p2align 4
+L(more64bytes_loop):
+ pcmpistri $0x08, 16(%rdi), %xmm1
+ jz L(more32bytes)
+
+ pcmpistri $0x08, 32(%rdi), %xmm1
+ jz L(more48bytes)
+
+ pcmpistri $0x08, 48(%rdi), %xmm1
+ jz L(more64bytes)
+
+ add $64, %rdi
+ pcmpistri $0x08, (%rdi), %xmm1
+ jnz L(more64bytes_loop)
+ leaq (%rdi,%rcx), %rax
+ subq %r8, %rax
+ ret
+
+ .p2align 4
+L(more32bytes):
+ leaq 16(%rdi,%rcx, 1), %rax
+ subq %r8, %rax
+ ret
+
+ .p2align 4
+L(more48bytes):
+ leaq 32(%rdi,%rcx, 1), %rax
+ subq %r8, %rax
+ ret
+
+ .p2align 4
+L(more64bytes):
+ leaq 48(%rdi,%rcx, 1), %rax
+ subq %r8, %rax
+ ret
+
+ .p2align 4
+L(less16bytes):
+ subq %r8, %rdi
+ bsfl %edx, %eax
+ addq %rdi, %rax
+ ret
+
+END (__strlen_sse42)
+
+#endif
Index: glibc-2.12-2-gc4ccff1/sysdeps/x86_64/multiarch/strlen.S
===================================================================
--- glibc-2.12-2-gc4ccff1.orig/sysdeps/x86_64/multiarch/strlen.S
+++ glibc-2.12-2-gc4ccff1/sysdeps/x86_64/multiarch/strlen.S
@@ -36,74 +36,12 @@ ENTRY(strlen)
 testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
 jz 2f
 leaq __strlen_sse42(%rip), %rax
-2: ret
-END(strlen)
-
-
- .section .text.sse4.2,"ax",@progbits
- .align 16
- .type __strlen_sse42, @function
-__strlen_sse42:
- cfi_startproc
- CALL_MCOUNT
- pxor %xmm1, %xmm1
- movl %edi, %ecx
- movq %rdi, %r8
- andq $~15, %rdi
- xor %edi, %ecx
- pcmpeqb (%rdi), %xmm1
- pmovmskb %xmm1, %edx
- shrl %cl, %edx
- shll %cl, %edx
- andl %edx, %edx
- jnz L(less16bytes)
- pxor %xmm1, %xmm1
-
- .p2align 4
-L(more64bytes_loop):
- pcmpistri $0x08, 16(%rdi), %xmm1
- jz L(more32bytes)
-
- pcmpistri $0x08, 32(%rdi), %xmm1
- jz L(more48bytes)
-
- pcmpistri $0x08, 48(%rdi), %xmm1
- jz L(more64bytes)
-
- add $64, %rdi
- pcmpistri $0x08, (%rdi), %xmm1
- jnz L(more64bytes_loop)
- leaq (%rdi,%rcx), %rax
- subq %r8, %rax
- ret
-
- .p2align 4
-L(more32bytes):
- leaq 16(%rdi,%rcx, 1), %rax
- subq %r8, %rax
- ret
-
- .p2align 4
-L(more48bytes):
- leaq 32(%rdi,%rcx, 1), %rax
- subq %r8, %rax
- ret
-
- .p2align 4
-L(more64bytes):
- leaq 48(%rdi,%rcx, 1), %rax
- subq %r8, %rax
 ret
-
- .p2align 4
-L(less16bytes):
- subq %r8, %rdi
- bsfl %edx, %eax
- addq %rdi, %rax
- ret
- cfi_endproc
- .size __strlen_sse42, .-__strlen_sse42
-
+2: testl $bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip)
+ jz 3f
+ leaq __strlen_no_bsf(%rip), %rax
+3: ret
+END(strlen)

 # undef ENTRY
 # define ENTRY(name) \
Index: glibc-2.12-2-gc4ccff1/sysdeps/x86_64/strlen.S
===================================================================
--- glibc-2.12-2-gc4ccff1.orig/sysdeps/x86_64/strlen.S
+++ glibc-2.12-2-gc4ccff1/sysdeps/x86_64/strlen.S
@@ -23,29 +23,80 @@

 .text
 ENTRY(strlen)
- pxor %xmm2, %xmm2
- movq %rdi, %rcx
- movq %rdi, %r8
- andq $~15, %rdi
- movdqa %xmm2, %xmm1
- pcmpeqb (%rdi), %xmm2
- orl $0xffffffff, %esi
- subq %rdi, %rcx
- shll %cl, %esi
- pmovmskb %xmm2, %edx
- andl %esi, %edx
- jnz 1f
-
-2: movdqa 16(%rdi), %xmm0
- leaq 16(%rdi), %rdi
+ xor %rax, %rax
+ mov %edi, %ecx
+ and $0x3f, %ecx
+ pxor %xmm0, %xmm0
+ cmp $0x30, %ecx
+ ja L(next)
+ movdqu (%rdi), %xmm1
 pcmpeqb %xmm1, %xmm0
 pmovmskb %xmm0, %edx
- testl %edx, %edx
- jz 2b
+ test %edx, %edx
+ jnz L(exit_less16)
+ mov %rdi, %rax
+ and $-16, %rax
+ jmp L(align16_start)
+L(next):
+ mov %rdi, %rax
+ and $-16, %rax
+ pcmpeqb (%rax), %xmm0
+ mov $-1, %esi
+ sub %rax, %rcx
+ shl %cl, %esi
+ pmovmskb %xmm0, %edx
+ and %esi, %edx
+ jnz L(exit)
+L(align16_start):
+ pxor %xmm0, %xmm0
+ pxor %xmm1, %xmm1
+ pxor %xmm2, %xmm2
+ pxor %xmm3, %xmm3
+ .p2align 4
+L(align16_loop):
+ pcmpeqb 16(%rax), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz L(exit16)

-1: subq %r8, %rdi
- bsfl %edx, %eax
- addq %rdi, %rax
+ pcmpeqb 32(%rax), %xmm1
+ pmovmskb %xmm1, %edx
+ test %edx, %edx
+ jnz L(exit32)
+
+ pcmpeqb 48(%rax), %xmm2
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ jnz L(exit48)
+
+ pcmpeqb 64(%rax), %xmm3
+ pmovmskb %xmm3, %edx
+ lea 64(%rax), %rax
+ test %edx, %edx
+ jz L(align16_loop)
+L(exit):
+ sub %rdi, %rax
+L(exit_less16):
+ bsf %rdx, %rdx
+ add %rdx, %rax
+ ret
+ .p2align 4
+L(exit16):
+ sub %rdi, %rax
+ bsf %rdx, %rdx
+ lea 16(%rdx,%rax), %rax
+ ret
+ .p2align 4
+L(exit32):
+ sub %rdi, %rax
+ bsf %rdx, %rdx
+ lea 32(%rdx,%rax), %rax
+ ret
+ .p2align 4
+L(exit48):
+ sub %rdi, %rax
+ bsf %rdx, %rdx
+ lea 48(%rdx,%rax), %rax
 ret
 END(strlen)
 libc_hidden_builtin_def (strlen)