]>
Commit | Line | Data |
---|---|---|
12788f63 MT |
1 | 2010-08-27 Ulrich Drepper <drepper@redhat.com> |
2 | ||
3 | * sysdeps/x86_64/multiarch/strlen-no-bsf.S: Move to .text.slow section. | |
4 | ||
5 | * sysdeps/x86_64/strlen.S: Minimal code improvement. | |
6 | ||
7 | 2010-08-26 H.J. Lu <hongjiu.lu@intel.com> | |
8 | ||
9 | * sysdeps/x86_64/strlen.S: Unroll the loop. | |
10 | * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add | |
11 | strlen-sse2 strlen-sse2-bsf. | |
12 | * sysdeps/x86_64/multiarch/strlen.S (strlen): Return | |
13 | __strlen_no_bsf if bit_Slow_BSF is set. | |
14 | (__strlen_sse42): Removed. | |
15 | * sysdeps/x86_64/multiarch/strlen-no-bsf.S: New file. | |
16 | * sysdeps/x86_64/multiarch/strlen-sse4.S: New file. | |
17 | ||
18 | 2010-08-25 H.J. Lu <hongjiu.lu@intel.com> | |
19 | ||
20 | * sysdeps/i386/i686/multiarch/Makefile (sysdep_routines): Add | |
21 | strlen-sse2 strlen-sse2-bsf. | |
22 | * sysdeps/i386/i686/multiarch/strlen.S (strlen): Return | |
23 | __strlen_sse2_bsf if bit_Slow_BSF is unset. | |
24 | (__strlen_sse2): Removed. | |
25 | * sysdeps/i386/i686/multiarch/strlen-sse2-bsf.S: New file. | |
26 | * sysdeps/i386/i686/multiarch/strlen-sse2.S: New file. | |
27 | * sysdeps/x86_64/multiarch/init-arch.c (__init_cpu_features): Set | |
28 | bit_Slow_BSF for Atom. | |
29 | * sysdeps/x86_64/multiarch/init-arch.h (bit_Slow_BSF): Define. | |
30 | (index_Slow_BSF): Define. | |
31 | (HAS_SLOW_BSF): Define. | |
32 | ||
33 | Index: glibc-2.12-2-gc4ccff1/sysdeps/i386/i686/multiarch/Makefile | |
34 | =================================================================== | |
35 | --- glibc-2.12-2-gc4ccff1.orig/sysdeps/i386/i686/multiarch/Makefile | |
36 | +++ glibc-2.12-2-gc4ccff1/sysdeps/i386/i686/multiarch/Makefile | |
37 | @@ -9,7 +9,8 @@ sysdep_routines += bzero-sse2 memset-sse | |
38 | memmove-ssse3-rep bcopy-ssse3 bcopy-ssse3-rep \ | |
39 | memset-sse2-rep bzero-sse2-rep strcmp-ssse3 \ | |
40 | strcmp-sse4 strncmp-c strncmp-ssse3 strncmp-sse4 \ | |
41 | - memcmp-ssse3 memcmp-sse4 strcasestr-nonascii | |
42 | + memcmp-ssse3 memcmp-sse4 strcasestr-nonascii \ | |
43 | + strlen-sse2 strlen-sse2-bsf | |
44 | ifeq (yes,$(config-cflags-sse4)) | |
45 | sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c | |
46 | CFLAGS-strcspn-c.c += -msse4 | |
47 | Index: glibc-2.12-2-gc4ccff1/sysdeps/i386/i686/multiarch/strlen-sse2-bsf.S | |
48 | =================================================================== | |
49 | --- /dev/null | |
50 | +++ glibc-2.12-2-gc4ccff1/sysdeps/i386/i686/multiarch/strlen-sse2-bsf.S | |
51 | @@ -0,0 +1,127 @@ | |
52 | +/* strlen with SSE2 and BSF | |
53 | + Copyright (C) 2010 Free Software Foundation, Inc. | |
54 | + Contributed by Intel Corporation. | |
55 | + This file is part of the GNU C Library. | |
56 | + | |
57 | + The GNU C Library is free software; you can redistribute it and/or | |
58 | + modify it under the terms of the GNU Lesser General Public | |
59 | + License as published by the Free Software Foundation; either | |
60 | + version 2.1 of the License, or (at your option) any later version. | |
61 | + | |
62 | + The GNU C Library is distributed in the hope that it will be useful, | |
63 | + but WITHOUT ANY WARRANTY; without even the implied warranty of | |
64 | + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
65 | + Lesser General Public License for more details. | |
66 | + | |
67 | + You should have received a copy of the GNU Lesser General Public | |
68 | + License along with the GNU C Library; if not, write to the Free | |
69 | + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA | |
70 | + 02111-1307 USA. */ | |
71 | + | |
72 | +#if defined SHARED && !defined NOT_IN_libc | |
73 | + | |
74 | +#include <sysdep.h> | |
75 | +#include "asm-syntax.h" | |
76 | + | |
77 | +#define CFI_PUSH(REG) \ | |
78 | + cfi_adjust_cfa_offset (4); \ | |
79 | + cfi_rel_offset (REG, 0) | |
80 | + | |
81 | +#define CFI_POP(REG) \ | |
82 | + cfi_adjust_cfa_offset (-4); \ | |
83 | + cfi_restore (REG) | |
84 | + | |
85 | +#define PUSH(REG) pushl REG; CFI_PUSH (REG) | |
86 | +#define POP(REG) popl REG; CFI_POP (REG) | |
87 | +#define PARMS 4 + 8 /* Preserve ESI and EDI. */ | |
88 | +#define STR PARMS | |
89 | +#define ENTRANCE PUSH (%esi); PUSH (%edi); cfi_remember_state | |
90 | +#define RETURN POP (%edi); POP (%esi); ret; \ | |
91 | + cfi_restore_state; cfi_remember_state | |
92 | + | |
93 | + .text | |
94 | +ENTRY ( __strlen_sse2_bsf) | |
95 | + ENTRANCE | |
96 | + mov STR(%esp), %edi | |
97 | + xor %eax, %eax | |
98 | + mov %edi, %ecx | |
99 | + and $0x3f, %ecx | |
100 | + pxor %xmm0, %xmm0 | |
101 | + cmp $0x30, %ecx | |
102 | + ja L(next) | |
103 | + movdqu (%edi), %xmm1 | |
104 | + pcmpeqb %xmm1, %xmm0 | |
105 | + pmovmskb %xmm0, %edx | |
106 | + test %edx, %edx | |
107 | + jnz L(exit_less16) | |
108 | + mov %edi, %eax | |
109 | + and $-16, %eax | |
110 | + jmp L(align16_start) | |
111 | +L(next): | |
112 | + | |
113 | + mov %edi, %eax | |
114 | + and $-16, %eax | |
115 | + pcmpeqb (%eax), %xmm0 | |
116 | + mov $-1, %esi | |
117 | + sub %eax, %ecx | |
118 | + shl %cl, %esi | |
119 | + pmovmskb %xmm0, %edx | |
120 | + and %esi, %edx | |
121 | + jnz L(exit) | |
122 | +L(align16_start): | |
123 | + pxor %xmm0, %xmm0 | |
124 | + pxor %xmm1, %xmm1 | |
125 | + pxor %xmm2, %xmm2 | |
126 | + pxor %xmm3, %xmm3 | |
127 | + .p2align 4 | |
128 | +L(align16_loop): | |
129 | + pcmpeqb 16(%eax), %xmm0 | |
130 | + pmovmskb %xmm0, %edx | |
131 | + test %edx, %edx | |
132 | + jnz L(exit16) | |
133 | + | |
134 | + pcmpeqb 32(%eax), %xmm1 | |
135 | + pmovmskb %xmm1, %edx | |
136 | + test %edx, %edx | |
137 | + jnz L(exit32) | |
138 | + | |
139 | + pcmpeqb 48(%eax), %xmm2 | |
140 | + pmovmskb %xmm2, %edx | |
141 | + test %edx, %edx | |
142 | + jnz L(exit48) | |
143 | + | |
144 | + pcmpeqb 64(%eax), %xmm3 | |
145 | + pmovmskb %xmm3, %edx | |
146 | + lea 64(%eax), %eax | |
147 | + test %edx, %edx | |
148 | + jz L(align16_loop) | |
149 | +L(exit): | |
150 | + sub %edi, %eax | |
151 | +L(exit_less16): | |
152 | + bsf %edx, %edx | |
153 | + add %edx, %eax | |
154 | + RETURN | |
155 | +L(exit16): | |
156 | + sub %edi, %eax | |
157 | + bsf %edx, %edx | |
158 | + add %edx, %eax | |
159 | + add $16, %eax | |
160 | + RETURN | |
161 | +L(exit32): | |
162 | + sub %edi, %eax | |
163 | + bsf %edx, %edx | |
164 | + add %edx, %eax | |
165 | + add $32, %eax | |
166 | + RETURN | |
167 | +L(exit48): | |
168 | + sub %edi, %eax | |
169 | + bsf %edx, %edx | |
170 | + add %edx, %eax | |
171 | + add $48, %eax | |
172 | + POP (%edi) | |
173 | + POP (%esi) | |
174 | + ret | |
175 | + | |
176 | +END ( __strlen_sse2_bsf) | |
177 | + | |
178 | +#endif | |
179 | Index: glibc-2.12-2-gc4ccff1/sysdeps/i386/i686/multiarch/strlen-sse2.S | |
180 | =================================================================== | |
181 | --- /dev/null | |
182 | +++ glibc-2.12-2-gc4ccff1/sysdeps/i386/i686/multiarch/strlen-sse2.S | |
183 | @@ -0,0 +1,347 @@ | |
184 | +/* strlen with SSE2 | |
185 | + Copyright (C) 2010 Free Software Foundation, Inc. | |
186 | + Contributed by Intel Corporation. | |
187 | + This file is part of the GNU C Library. | |
188 | + | |
189 | + The GNU C Library is free software; you can redistribute it and/or | |
190 | + modify it under the terms of the GNU Lesser General Public | |
191 | + License as published by the Free Software Foundation; either | |
192 | + version 2.1 of the License, or (at your option) any later version. | |
193 | + | |
194 | + The GNU C Library is distributed in the hope that it will be useful, | |
195 | + but WITHOUT ANY WARRANTY; without even the implied warranty of | |
196 | + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
197 | + Lesser General Public License for more details. | |
198 | + | |
199 | + You should have received a copy of the GNU Lesser General Public | |
200 | + License along with the GNU C Library; if not, write to the Free | |
201 | + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA | |
202 | + 02111-1307 USA. */ | |
203 | + | |
204 | +#if defined SHARED && !defined NOT_IN_libc | |
205 | + | |
206 | +#include <sysdep.h> | |
207 | +#include "asm-syntax.h" | |
208 | + | |
209 | +#define CFI_PUSH(REG) \ | |
210 | + cfi_adjust_cfa_offset (4); \ | |
211 | + cfi_rel_offset (REG, 0) | |
212 | + | |
213 | +#define CFI_POP(REG) \ | |
214 | + cfi_adjust_cfa_offset (-4); \ | |
215 | + cfi_restore (REG) | |
216 | + | |
217 | +#define PUSH(REG) pushl REG; CFI_PUSH (REG) | |
218 | +#define POP(REG) popl REG; CFI_POP (REG) | |
219 | +#define PARMS 4 | |
220 | +#define STR PARMS | |
221 | +#define ENTRANCE | |
222 | +#define RETURN ret | |
223 | + | |
224 | + .text | |
225 | +ENTRY (__strlen_sse2) | |
226 | + ENTRANCE | |
227 | + mov STR(%esp), %edx | |
228 | + xor %eax, %eax | |
229 | + cmpb $0, (%edx) | |
230 | + jz L(exit_tail0) | |
231 | + cmpb $0, 1(%edx) | |
232 | + jz L(exit_tail1) | |
233 | + cmpb $0, 2(%edx) | |
234 | + jz L(exit_tail2) | |
235 | + cmpb $0, 3(%edx) | |
236 | + jz L(exit_tail3) | |
237 | + cmpb $0, 4(%edx) | |
238 | + jz L(exit_tail4) | |
239 | + cmpb $0, 5(%edx) | |
240 | + jz L(exit_tail5) | |
241 | + cmpb $0, 6(%edx) | |
242 | + jz L(exit_tail6) | |
243 | + cmpb $0, 7(%edx) | |
244 | + jz L(exit_tail7) | |
245 | + cmpb $0, 8(%edx) | |
246 | + jz L(exit_tail8) | |
247 | + cmpb $0, 9(%edx) | |
248 | + jz L(exit_tail9) | |
249 | + cmpb $0, 10(%edx) | |
250 | + jz L(exit_tail10) | |
251 | + cmpb $0, 11(%edx) | |
252 | + jz L(exit_tail11) | |
253 | + cmpb $0, 12(%edx) | |
254 | + jz L(exit_tail12) | |
255 | + cmpb $0, 13(%edx) | |
256 | + jz L(exit_tail13) | |
257 | + cmpb $0, 14(%edx) | |
258 | + jz L(exit_tail14) | |
259 | + cmpb $0, 15(%edx) | |
260 | + jz L(exit_tail15) | |
261 | + pxor %xmm0, %xmm0 | |
262 | + mov %edx, %eax | |
263 | + mov %edx, %ecx | |
264 | + and $-16, %eax | |
265 | + add $16, %ecx | |
266 | + add $16, %eax | |
267 | + | |
268 | + pcmpeqb (%eax), %xmm0 | |
269 | + pmovmskb %xmm0, %edx | |
270 | + pxor %xmm1, %xmm1 | |
271 | + test %edx, %edx | |
272 | + lea 16(%eax), %eax | |
273 | + jnz L(exit) | |
274 | + | |
275 | + pcmpeqb (%eax), %xmm1 | |
276 | + pmovmskb %xmm1, %edx | |
277 | + pxor %xmm2, %xmm2 | |
278 | + test %edx, %edx | |
279 | + lea 16(%eax), %eax | |
280 | + jnz L(exit) | |
281 | + | |
282 | + | |
283 | + pcmpeqb (%eax), %xmm2 | |
284 | + pmovmskb %xmm2, %edx | |
285 | + pxor %xmm3, %xmm3 | |
286 | + test %edx, %edx | |
287 | + lea 16(%eax), %eax | |
288 | + jnz L(exit) | |
289 | + | |
290 | + pcmpeqb (%eax), %xmm3 | |
291 | + pmovmskb %xmm3, %edx | |
292 | + test %edx, %edx | |
293 | + lea 16(%eax), %eax | |
294 | + jnz L(exit) | |
295 | + | |
296 | + pcmpeqb (%eax), %xmm0 | |
297 | + pmovmskb %xmm0, %edx | |
298 | + test %edx, %edx | |
299 | + lea 16(%eax), %eax | |
300 | + jnz L(exit) | |
301 | + | |
302 | + pcmpeqb (%eax), %xmm1 | |
303 | + pmovmskb %xmm1, %edx | |
304 | + test %edx, %edx | |
305 | + lea 16(%eax), %eax | |
306 | + jnz L(exit) | |
307 | + | |
308 | + pcmpeqb (%eax), %xmm2 | |
309 | + pmovmskb %xmm2, %edx | |
310 | + test %edx, %edx | |
311 | + lea 16(%eax), %eax | |
312 | + jnz L(exit) | |
313 | + | |
314 | + pcmpeqb (%eax), %xmm3 | |
315 | + pmovmskb %xmm3, %edx | |
316 | + test %edx, %edx | |
317 | + lea 16(%eax), %eax | |
318 | + jnz L(exit) | |
319 | + | |
320 | + pcmpeqb (%eax), %xmm0 | |
321 | + pmovmskb %xmm0, %edx | |
322 | + test %edx, %edx | |
323 | + lea 16(%eax), %eax | |
324 | + jnz L(exit) | |
325 | + | |
326 | + pcmpeqb (%eax), %xmm1 | |
327 | + pmovmskb %xmm1, %edx | |
328 | + test %edx, %edx | |
329 | + lea 16(%eax), %eax | |
330 | + jnz L(exit) | |
331 | + | |
332 | + pcmpeqb (%eax), %xmm2 | |
333 | + pmovmskb %xmm2, %edx | |
334 | + test %edx, %edx | |
335 | + lea 16(%eax), %eax | |
336 | + jnz L(exit) | |
337 | + | |
338 | + pcmpeqb (%eax), %xmm3 | |
339 | + pmovmskb %xmm3, %edx | |
340 | + test %edx, %edx | |
341 | + lea 16(%eax), %eax | |
342 | + jnz L(exit) | |
343 | + | |
344 | + pcmpeqb (%eax), %xmm0 | |
345 | + pmovmskb %xmm0, %edx | |
346 | + test %edx, %edx | |
347 | + lea 16(%eax), %eax | |
348 | + jnz L(exit) | |
349 | + | |
350 | + pcmpeqb (%eax), %xmm1 | |
351 | + pmovmskb %xmm1, %edx | |
352 | + test %edx, %edx | |
353 | + lea 16(%eax), %eax | |
354 | + jnz L(exit) | |
355 | + | |
356 | + pcmpeqb (%eax), %xmm2 | |
357 | + pmovmskb %xmm2, %edx | |
358 | + test %edx, %edx | |
359 | + lea 16(%eax), %eax | |
360 | + jnz L(exit) | |
361 | + | |
362 | + pcmpeqb (%eax), %xmm3 | |
363 | + pmovmskb %xmm3, %edx | |
364 | + test %edx, %edx | |
365 | + lea 16(%eax), %eax | |
366 | + jnz L(exit) | |
367 | + | |
368 | + and $-0x40, %eax | |
369 | + PUSH (%esi) | |
370 | + PUSH (%edi) | |
371 | + PUSH (%ebx) | |
372 | + PUSH (%ebp) | |
373 | + xor %ebp, %ebp | |
374 | +L(aligned_64): | |
375 | + pcmpeqb (%eax), %xmm0 | |
376 | + pcmpeqb 16(%eax), %xmm1 | |
377 | + pcmpeqb 32(%eax), %xmm2 | |
378 | + pcmpeqb 48(%eax), %xmm3 | |
379 | + pmovmskb %xmm0, %edx | |
380 | + pmovmskb %xmm1, %esi | |
381 | + pmovmskb %xmm2, %edi | |
382 | + pmovmskb %xmm3, %ebx | |
383 | + or %edx, %ebp | |
384 | + or %esi, %ebp | |
385 | + or %edi, %ebp | |
386 | + or %ebx, %ebp | |
387 | + lea 64(%eax), %eax | |
388 | + jz L(aligned_64) | |
389 | +L(48leave): | |
390 | + test %edx, %edx | |
391 | + jnz L(aligned_64_exit_16) | |
392 | + test %esi, %esi | |
393 | + jnz L(aligned_64_exit_32) | |
394 | + test %edi, %edi | |
395 | + jnz L(aligned_64_exit_48) | |
396 | + mov %ebx, %edx | |
397 | + lea (%eax), %eax | |
398 | + jmp L(aligned_64_exit) | |
399 | +L(aligned_64_exit_48): | |
400 | + lea -16(%eax), %eax | |
401 | + mov %edi, %edx | |
402 | + jmp L(aligned_64_exit) | |
403 | +L(aligned_64_exit_32): | |
404 | + lea -32(%eax), %eax | |
405 | + mov %esi, %edx | |
406 | + jmp L(aligned_64_exit) | |
407 | +L(aligned_64_exit_16): | |
408 | + lea -48(%eax), %eax | |
409 | +L(aligned_64_exit): | |
410 | + POP (%ebp) | |
411 | + POP (%ebx) | |
412 | + POP (%edi) | |
413 | + POP (%esi) | |
414 | +L(exit): | |
415 | + sub %ecx, %eax | |
416 | + test %dl, %dl | |
417 | + jz L(exit_high) | |
418 | + test $0x01, %dl | |
419 | + jnz L(exit_tail0) | |
420 | + | |
421 | + test $0x02, %dl | |
422 | + jnz L(exit_tail1) | |
423 | + | |
424 | + test $0x04, %dl | |
425 | + jnz L(exit_tail2) | |
426 | + | |
427 | + test $0x08, %dl | |
428 | + jnz L(exit_tail3) | |
429 | + | |
430 | + test $0x10, %dl | |
431 | + jnz L(exit_tail4) | |
432 | + | |
433 | + test $0x20, %dl | |
434 | + jnz L(exit_tail5) | |
435 | + | |
436 | + test $0x40, %dl | |
437 | + jnz L(exit_tail6) | |
438 | + add $7, %eax | |
439 | +L(exit_tail0): | |
440 | + RETURN | |
441 | + | |
442 | +L(exit_high): | |
443 | + add $8, %eax | |
444 | + test $0x01, %dh | |
445 | + jnz L(exit_tail0) | |
446 | + | |
447 | + test $0x02, %dh | |
448 | + jnz L(exit_tail1) | |
449 | + | |
450 | + test $0x04, %dh | |
451 | + jnz L(exit_tail2) | |
452 | + | |
453 | + test $0x08, %dh | |
454 | + jnz L(exit_tail3) | |
455 | + | |
456 | + test $0x10, %dh | |
457 | + jnz L(exit_tail4) | |
458 | + | |
459 | + test $0x20, %dh | |
460 | + jnz L(exit_tail5) | |
461 | + | |
462 | + test $0x40, %dh | |
463 | + jnz L(exit_tail6) | |
464 | + add $7, %eax | |
465 | + RETURN | |
466 | + | |
467 | + .p2align 4 | |
468 | +L(exit_tail1): | |
469 | + add $1, %eax | |
470 | + RETURN | |
471 | + | |
472 | +L(exit_tail2): | |
473 | + add $2, %eax | |
474 | + RETURN | |
475 | + | |
476 | +L(exit_tail3): | |
477 | + add $3, %eax | |
478 | + RETURN | |
479 | + | |
480 | +L(exit_tail4): | |
481 | + add $4, %eax | |
482 | + RETURN | |
483 | + | |
484 | +L(exit_tail5): | |
485 | + add $5, %eax | |
486 | + RETURN | |
487 | + | |
488 | +L(exit_tail6): | |
489 | + add $6, %eax | |
490 | + RETURN | |
491 | + | |
492 | +L(exit_tail7): | |
493 | + add $7, %eax | |
494 | + RETURN | |
495 | + | |
496 | +L(exit_tail8): | |
497 | + add $8, %eax | |
498 | + RETURN | |
499 | + | |
500 | +L(exit_tail9): | |
501 | + add $9, %eax | |
502 | + RETURN | |
503 | + | |
504 | +L(exit_tail10): | |
505 | + add $10, %eax | |
506 | + RETURN | |
507 | + | |
508 | +L(exit_tail11): | |
509 | + add $11, %eax | |
510 | + RETURN | |
511 | + | |
512 | +L(exit_tail12): | |
513 | + add $12, %eax | |
514 | + RETURN | |
515 | + | |
516 | +L(exit_tail13): | |
517 | + add $13, %eax | |
518 | + RETURN | |
519 | + | |
520 | +L(exit_tail14): | |
521 | + add $14, %eax | |
522 | + RETURN | |
523 | + | |
524 | +L(exit_tail15): | |
525 | + add $15, %eax | |
526 | + ret | |
527 | + | |
528 | +END (__strlen_sse2) | |
529 | + | |
530 | +#endif | |
531 | Index: glibc-2.12-2-gc4ccff1/sysdeps/i386/i686/multiarch/strlen.S | |
532 | =================================================================== | |
533 | --- glibc-2.12-2-gc4ccff1.orig/sysdeps/i386/i686/multiarch/strlen.S | |
534 | +++ glibc-2.12-2-gc4ccff1/sysdeps/i386/i686/multiarch/strlen.S | |
535 | @@ -48,6 +48,9 @@ ENTRY(strlen) | |
536 | 1: leal __strlen_ia32@GOTOFF(%ebx), %eax | |
537 | testl $bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx) | |
538 | jz 2f | |
539 | + leal __strlen_sse2_bsf@GOTOFF(%ebx), %eax | |
540 | + testl $bit_Slow_BSF, FEATURE_OFFSET+index_Slow_BSF+__cpu_features@GOTOFF(%ebx) | |
541 | + jz 2f | |
542 | leal __strlen_sse2@GOTOFF(%ebx), %eax | |
543 | 2: popl %ebx | |
544 | cfi_adjust_cfa_offset (-4); | |
545 | @@ -55,84 +58,6 @@ ENTRY(strlen) | |
546 | ret | |
547 | END(strlen) | |
548 | ||
549 | -#define CFI_POP(REG) \ | |
550 | - cfi_adjust_cfa_offset (-4); \ | |
551 | - cfi_restore (REG) | |
552 | - | |
553 | -#define RETURN popl %esi; CFI_POP (esi); ret | |
554 | - | |
555 | - .text | |
556 | -ENTRY (__strlen_sse2) | |
557 | -/* | |
558 | - * This implementation uses SSE instructions to compare up to 16 bytes | |
559 | - * at a time looking for the end of string (null char). | |
560 | - */ | |
561 | - pushl %esi | |
562 | - cfi_adjust_cfa_offset (4) | |
563 | - cfi_rel_offset (%esi, 0) | |
564 | - mov 8(%esp), %eax | |
565 | - mov %eax, %ecx | |
566 | - pxor %xmm0, %xmm0 /* 16 null chars */ | |
567 | - mov %eax, %esi | |
568 | - and $15, %ecx | |
569 | - jz 1f /* string is 16 byte aligned */ | |
570 | - | |
571 | - /* | |
572 | - * Unaligned case. Round down to 16-byte boundary before comparing | |
573 | - * 16 bytes for a null char. The code then compensates for any extra chars | |
574 | - * preceding the start of the string. | |
575 | - */ | |
576 | - and $-16, %esi | |
577 | - | |
578 | - pcmpeqb (%esi), %xmm0 | |
579 | - lea 16(%eax), %esi | |
580 | - pmovmskb %xmm0, %edx | |
581 | - | |
582 | - shr %cl, %edx /* Compensate for bytes preceding the string */ | |
583 | - test %edx, %edx | |
584 | - jnz 2f | |
585 | - sub %ecx, %esi /* no null, adjust to next 16-byte boundary */ | |
586 | - pxor %xmm0, %xmm0 /* clear xmm0, may have been changed... */ | |
587 | - | |
588 | - .p2align 4 | |
589 | -1: /* 16 byte aligned */ | |
590 | - pcmpeqb (%esi), %xmm0 /* look for null bytes */ | |
591 | - pmovmskb %xmm0, %edx /* move each byte mask of %xmm0 to edx */ | |
592 | - | |
593 | - add $16, %esi /* prepare to search next 16 bytes */ | |
594 | - test %edx, %edx /* if no null byte, %edx must be 0 */ | |
595 | - jnz 2f /* found a null */ | |
596 | - | |
597 | - pcmpeqb (%esi), %xmm0 | |
598 | - pmovmskb %xmm0, %edx | |
599 | - add $16, %esi | |
600 | - test %edx, %edx | |
601 | - jnz 2f | |
602 | - | |
603 | - pcmpeqb (%esi), %xmm0 | |
604 | - pmovmskb %xmm0, %edx | |
605 | - add $16, %esi | |
606 | - test %edx, %edx | |
607 | - jnz 2f | |
608 | - | |
609 | - pcmpeqb (%esi), %xmm0 | |
610 | - pmovmskb %xmm0, %edx | |
611 | - add $16, %esi | |
612 | - test %edx, %edx | |
613 | - jz 1b | |
614 | - | |
615 | -2: | |
616 | - neg %eax | |
617 | - lea -16(%eax, %esi), %eax /* calculate exact offset */ | |
618 | - bsf %edx, %ecx /* Least significant 1 bit is index of null */ | |
619 | - add %ecx, %eax | |
620 | - popl %esi | |
621 | - cfi_adjust_cfa_offset (-4) | |
622 | - cfi_restore (%esi) | |
623 | - ret | |
624 | - | |
625 | -END (__strlen_sse2) | |
626 | - | |
627 | # undef ENTRY | |
628 | # define ENTRY(name) \ | |
629 | .type __strlen_ia32, @function; \ | |
630 | --- a/sysdeps/x86_64/multiarch/Makefile 2012-03-01 10:43:30.060487726 -0700 | |
631 | +++ b/sysdeps/x86_64/multiarch/Makefile 2012-03-01 10:45:57.894692115 -0700 | |
632 | @@ -7,7 +7,7 @@ ifeq ($(subdir),string) | |
633 | sysdep_routines += stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \ | |
634 | strend-sse4 memcmp-sse4 \ | |
635 | strcasestr-nonascii strcasecmp_l-ssse3 \ | |
636 | - strncase_l-ssse3 \ | |
637 | + strncase_l-ssse3 strlen-sse4 strlen-no-bsf \ | |
638 | memset-x86-64 | |
639 | ifeq (yes,$(config-cflags-sse4)) | |
640 | sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c | |
641 | Index: glibc-2.12-2-gc4ccff1/sysdeps/x86_64/multiarch/init-arch.c | |
642 | =================================================================== | |
643 | --- glibc-2.12-2-gc4ccff1.orig/sysdeps/x86_64/multiarch/init-arch.c | |
644 | +++ glibc-2.12-2-gc4ccff1/sysdeps/x86_64/multiarch/init-arch.c | |
645 | @@ -77,6 +77,12 @@ __init_cpu_features (void) | |
646 | model += extended_model; | |
647 | switch (model) | |
648 | { | |
649 | + case 0x1c: | |
650 | + case 0x26: | |
651 | + /* BSF is slow on Atom. */ | |
652 | + __cpu_features.feature[index_Slow_BSF] |= bit_Slow_BSF; | |
653 | + break; | |
654 | + | |
655 | case 0x1a: | |
656 | case 0x1e: | |
657 | case 0x1f: | |
658 | --- a/sysdeps/x86_64/multiarch/init-arch.h 2012-03-01 10:43:30.061487720 -0700 | |
659 | +++ b/sysdeps/x86_64/multiarch/init-arch.h 2012-03-01 10:48:13.371963005 -0700 | |
660 | @@ -17,6 +17,7 @@ | |
661 | 02111-1307 USA. */ | |
662 | ||
663 | #define bit_Fast_Rep_String (1 << 0) | |
664 | +#define bit_Slow_BSF (1 << 2) | |
665 | #define bit_Prefer_SSE_for_memop (1 << 3) | |
666 | ||
667 | #ifdef __ASSEMBLER__ | |
668 | @@ -34,6 +35,7 @@ | |
669 | # define index_SSE4_2 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET | |
670 | ||
671 | #define index_Fast_Rep_String FEATURE_INDEX_1*FEATURE_SIZE | |
672 | +# define index_Slow_BSF FEATURE_INDEX_1*FEATURE_SIZE | |
673 | # define index_Prefer_SSE_for_memop FEATURE_INDEX_1*FEATURE_SIZE | |
674 | ||
675 | #else /* __ASSEMBLER__ */ | |
676 | @@ -105,11 +107,15 @@ extern const struct cpu_features *__get_ | |
677 | # define HAS_FMA HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, 12) | |
678 | ||
679 | # define index_Fast_Rep_String FEATURE_INDEX_1 | |
680 | +# define index_Slow_BSF FEATURE_INDEX_1 | |
681 | # define index_Prefer_SSE_for_memop FEATURE_INDEX_1 | |
682 | ||
683 | #define HAS_ARCH_FEATURE(idx, bit) \ | |
684 | ((__get_cpu_features ()->feature[idx] & (bit)) != 0) | |
685 | ||
686 | +#define HAS_SLOW_BSF \ | |
687 | + HAS_ARCH_FEATURE (index_Slow_BSF, bit_Slow_BSF) | |
688 | + | |
689 | #define HAS_PREFER_SSE_FOR_MEMOP \ | |
690 | HAS_ARCH_FEATURE (index_Prefer_SSE_for_memop, bit_Prefer_SSE_for_memop) | |
691 | ||
692 | Index: glibc-2.12-2-gc4ccff1/sysdeps/x86_64/multiarch/strlen-no-bsf.S | |
693 | =================================================================== | |
694 | --- /dev/null | |
695 | +++ glibc-2.12-2-gc4ccff1/sysdeps/x86_64/multiarch/strlen-no-bsf.S | |
696 | @@ -0,0 +1,309 @@ | |
697 | +/* strlen without BSF | |
698 | + Copyright (C) 2010 Free Software Foundation, Inc. | |
699 | + Contributed by Intel Corporation. | |
700 | + This file is part of the GNU C Library. | |
701 | + | |
702 | + The GNU C Library is free software; you can redistribute it and/or | |
703 | + modify it under the terms of the GNU Lesser General Public | |
704 | + License as published by the Free Software Foundation; either | |
705 | + version 2.1 of the License, or (at your option) any later version. | |
706 | + | |
707 | + The GNU C Library is distributed in the hope that it will be useful, | |
708 | + but WITHOUT ANY WARRANTY; without even the implied warranty of | |
709 | + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
710 | + Lesser General Public License for more details. | |
711 | + | |
712 | + You should have received a copy of the GNU Lesser General Public | |
713 | + License along with the GNU C Library; if not, write to the Free | |
714 | + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA | |
715 | + 02111-1307 USA. */ | |
716 | + | |
717 | +#if defined SHARED && !defined NOT_IN_libc | |
718 | + | |
719 | +#include <sysdep.h> | |
720 | + | |
721 | + .section .text.slow,"ax",@progbits | |
722 | +ENTRY (__strlen_no_bsf) | |
723 | + xor %eax, %eax | |
724 | + cmpb $0, (%rdi) | |
725 | + jz L(exit_tail0) | |
726 | + cmpb $0, 1(%rdi) | |
727 | + jz L(exit_tail1) | |
728 | + cmpb $0, 2(%rdi) | |
729 | + jz L(exit_tail2) | |
730 | + cmpb $0, 3(%rdi) | |
731 | + jz L(exit_tail3) | |
732 | + cmpb $0, 4(%rdi) | |
733 | + jz L(exit_tail4) | |
734 | + cmpb $0, 5(%rdi) | |
735 | + jz L(exit_tail5) | |
736 | + cmpb $0, 6(%rdi) | |
737 | + jz L(exit_tail6) | |
738 | + cmpb $0, 7(%rdi) | |
739 | + jz L(exit_tail7) | |
740 | + cmpb $0, 8(%rdi) | |
741 | + jz L(exit_tail8) | |
742 | + cmpb $0, 9(%rdi) | |
743 | + jz L(exit_tail9) | |
744 | + cmpb $0, 10(%rdi) | |
745 | + jz L(exit_tail10) | |
746 | + cmpb $0, 11(%rdi) | |
747 | + jz L(exit_tail11) | |
748 | + cmpb $0, 12(%rdi) | |
749 | + jz L(exit_tail12) | |
750 | + cmpb $0, 13(%rdi) | |
751 | + jz L(exit_tail13) | |
752 | + cmpb $0, 14(%rdi) | |
753 | + jz L(exit_tail14) | |
754 | + cmpb $0, 15(%rdi) | |
755 | + jz L(exit_tail15) | |
756 | + pxor %xmm0, %xmm0 | |
757 | + mov %rdi, %rcx | |
758 | + mov %rdi, %rax | |
759 | + and $-16, %rax | |
760 | + add $16, %rax | |
761 | + add $16, %rcx | |
762 | + | |
763 | + pcmpeqb (%rax), %xmm0 | |
764 | + pmovmskb %xmm0, %edx | |
765 | + pxor %xmm1, %xmm1 | |
766 | + test %edx, %edx | |
767 | + lea 16(%rax), %rax | |
768 | + jnz L(exit) | |
769 | + | |
770 | + pcmpeqb (%rax), %xmm1 | |
771 | + pmovmskb %xmm1, %edx | |
772 | + pxor %xmm2, %xmm2 | |
773 | + test %edx, %edx | |
774 | + lea 16(%rax), %rax | |
775 | + jnz L(exit) | |
776 | + | |
777 | + | |
778 | + pcmpeqb (%rax), %xmm2 | |
779 | + pmovmskb %xmm2, %edx | |
780 | + pxor %xmm3, %xmm3 | |
781 | + test %edx, %edx | |
782 | + lea 16(%rax), %rax | |
783 | + jnz L(exit) | |
784 | + | |
785 | + pcmpeqb (%rax), %xmm3 | |
786 | + pmovmskb %xmm3, %edx | |
787 | + test %edx, %edx | |
788 | + lea 16(%rax), %rax | |
789 | + jnz L(exit) | |
790 | + | |
791 | + pcmpeqb (%rax), %xmm0 | |
792 | + pmovmskb %xmm0, %edx | |
793 | + test %edx, %edx | |
794 | + lea 16(%rax), %rax | |
795 | + jnz L(exit) | |
796 | + | |
797 | + pcmpeqb (%rax), %xmm1 | |
798 | + pmovmskb %xmm1, %edx | |
799 | + test %edx, %edx | |
800 | + lea 16(%rax), %rax | |
801 | + jnz L(exit) | |
802 | + | |
803 | + pcmpeqb (%rax), %xmm2 | |
804 | + pmovmskb %xmm2, %edx | |
805 | + test %edx, %edx | |
806 | + lea 16(%rax), %rax | |
807 | + jnz L(exit) | |
808 | + | |
809 | + pcmpeqb (%rax), %xmm3 | |
810 | + pmovmskb %xmm3, %edx | |
811 | + test %edx, %edx | |
812 | + lea 16(%rax), %rax | |
813 | + jnz L(exit) | |
814 | + | |
815 | + pcmpeqb (%rax), %xmm0 | |
816 | + pmovmskb %xmm0, %edx | |
817 | + test %edx, %edx | |
818 | + lea 16(%rax), %rax | |
819 | + jnz L(exit) | |
820 | + | |
821 | + pcmpeqb (%rax), %xmm1 | |
822 | + pmovmskb %xmm1, %edx | |
823 | + test %edx, %edx | |
824 | + lea 16(%rax), %rax | |
825 | + jnz L(exit) | |
826 | + | |
827 | + pcmpeqb (%rax), %xmm2 | |
828 | + pmovmskb %xmm2, %edx | |
829 | + test %edx, %edx | |
830 | + lea 16(%rax), %rax | |
831 | + jnz L(exit) | |
832 | + | |
833 | + pcmpeqb (%rax), %xmm3 | |
834 | + pmovmskb %xmm3, %edx | |
835 | + test %edx, %edx | |
836 | + lea 16(%rax), %rax | |
837 | + jnz L(exit) | |
838 | + | |
839 | + pcmpeqb (%rax), %xmm0 | |
840 | + pmovmskb %xmm0, %edx | |
841 | + test %edx, %edx | |
842 | + lea 16(%rax), %rax | |
843 | + jnz L(exit) | |
844 | + | |
845 | + pcmpeqb (%rax), %xmm1 | |
846 | + pmovmskb %xmm1, %edx | |
847 | + test %edx, %edx | |
848 | + lea 16(%rax), %rax | |
849 | + jnz L(exit) | |
850 | + | |
851 | + pcmpeqb (%rax), %xmm2 | |
852 | + pmovmskb %xmm2, %edx | |
853 | + test %edx, %edx | |
854 | + lea 16(%rax), %rax | |
855 | + jnz L(exit) | |
856 | + | |
857 | + pcmpeqb (%rax), %xmm3 | |
858 | + pmovmskb %xmm3, %edx | |
859 | + test %edx, %edx | |
860 | + lea 16(%rax), %rax | |
861 | + jnz L(exit) | |
862 | + | |
863 | + and $-0x40, %rax | |
864 | + xor %r8d, %r8d | |
865 | +L(aligned_64): | |
866 | + pcmpeqb (%rax), %xmm0 | |
867 | + pcmpeqb 16(%rax), %xmm1 | |
868 | + pcmpeqb 32(%rax), %xmm2 | |
869 | + pcmpeqb 48(%rax), %xmm3 | |
870 | + pmovmskb %xmm0, %edx | |
871 | + pmovmskb %xmm1, %esi | |
872 | + pmovmskb %xmm2, %edi | |
873 | + pmovmskb %xmm3, %r9d | |
874 | + or %edx, %r8d | |
875 | + or %esi, %r8d | |
876 | + or %edi, %r8d | |
877 | + or %r9d, %r8d | |
878 | + lea 64(%rax), %rax | |
879 | + jz L(aligned_64) | |
880 | + | |
881 | + test %edx, %edx | |
882 | + jnz L(aligned_64_exit_16) | |
883 | + test %esi, %esi | |
884 | + jnz L(aligned_64_exit_32) | |
885 | + test %edi, %edi | |
886 | + jnz L(aligned_64_exit_48) | |
887 | +L(aligned_64_exit_64): | |
888 | + mov %r9d, %edx | |
889 | + jmp L(aligned_64_exit) | |
890 | +L(aligned_64_exit_48): | |
891 | + lea -16(%rax), %rax | |
892 | + mov %edi, %edx | |
893 | + jmp L(aligned_64_exit) | |
894 | +L(aligned_64_exit_32): | |
895 | + lea -32(%rax), %rax | |
896 | + mov %esi, %edx | |
897 | + jmp L(aligned_64_exit) | |
898 | +L(aligned_64_exit_16): | |
899 | + lea -48(%rax), %rax | |
900 | +L(aligned_64_exit): | |
901 | +L(exit): | |
902 | + sub %rcx, %rax | |
903 | + test %dl, %dl | |
904 | + jz L(exit_high) | |
905 | + test $0x01, %dl | |
906 | + jnz L(exit_tail0) | |
907 | + | |
908 | + test $0x02, %dl | |
909 | + jnz L(exit_tail1) | |
910 | + | |
911 | + test $0x04, %dl | |
912 | + jnz L(exit_tail2) | |
913 | + | |
914 | + test $0x08, %dl | |
915 | + jnz L(exit_tail3) | |
916 | + | |
917 | + test $0x10, %dl | |
918 | + jnz L(exit_tail4) | |
919 | + | |
920 | + test $0x20, %dl | |
921 | + jnz L(exit_tail5) | |
922 | + | |
923 | + test $0x40, %dl | |
924 | + jnz L(exit_tail6) | |
925 | + add $7, %eax | |
926 | +L(exit_tail0): | |
927 | + ret | |
928 | + | |
929 | +L(exit_high): | |
930 | + add $8, %eax | |
931 | + test $0x01, %dh | |
932 | + jnz L(exit_tail0) | |
933 | + | |
934 | + test $0x02, %dh | |
935 | + jnz L(exit_tail1) | |
936 | + | |
937 | + test $0x04, %dh | |
938 | + jnz L(exit_tail2) | |
939 | + | |
940 | + test $0x08, %dh | |
941 | + jnz L(exit_tail3) | |
942 | + | |
943 | + test $0x10, %dh | |
944 | + jnz L(exit_tail4) | |
945 | + | |
946 | + test $0x20, %dh | |
947 | + jnz L(exit_tail5) | |
948 | + | |
949 | + test $0x40, %dh | |
950 | + jnz L(exit_tail6) | |
951 | + add $7, %eax | |
952 | + ret | |
953 | + .p2align 4 | |
954 | +L(exit_tail1): | |
955 | + add $1, %eax | |
956 | + ret | |
957 | + | |
958 | +L(exit_tail2): | |
959 | + add $2, %eax | |
960 | + ret | |
961 | + | |
962 | +L(exit_tail3): | |
963 | + add $3, %eax | |
964 | + ret | |
965 | + | |
966 | +L(exit_tail4): | |
967 | + add $4, %eax | |
968 | + ret | |
969 | + | |
970 | +L(exit_tail5): | |
971 | + add $5, %eax | |
972 | + ret | |
973 | +L(exit_tail6): | |
974 | + add $6, %eax | |
975 | + ret | |
976 | +L(exit_tail7): | |
977 | + add $7, %eax | |
978 | + ret | |
979 | +L(exit_tail8): | |
980 | + add $8, %eax | |
981 | + ret | |
982 | +L(exit_tail9): | |
983 | + add $9, %eax | |
984 | + ret | |
985 | +L(exit_tail10): | |
986 | + add $10, %eax | |
987 | + ret | |
988 | +L(exit_tail11): | |
989 | + add $11, %eax | |
990 | + ret | |
991 | +L(exit_tail12): | |
992 | + add $12, %eax | |
993 | + ret | |
994 | +L(exit_tail13): | |
995 | + add $13, %eax | |
996 | + ret | |
997 | +L(exit_tail14): | |
998 | + add $14, %eax | |
999 | + ret | |
1000 | +L(exit_tail15): | |
1001 | + add $15, %eax | |
1002 | + ret | |
1003 | +END (__strlen_no_bsf) | |
1004 | + | |
1005 | +#endif | |
1006 | Index: glibc-2.12-2-gc4ccff1/sysdeps/x86_64/multiarch/strlen-sse4.S | |
1007 | =================================================================== | |
1008 | --- /dev/null | |
1009 | +++ glibc-2.12-2-gc4ccff1/sysdeps/x86_64/multiarch/strlen-sse4.S | |
1010 | @@ -0,0 +1,85 @@ | |
1011 | +/* strlen with SSE4 | |
1012 | + Copyright (C) 2009, 2010 Free Software Foundation, Inc. | |
1013 | + Contributed by Ulrich Drepper <drepper@redhat.com>. | |
1014 | + This file is part of the GNU C Library. | |
1015 | + | |
1016 | + The GNU C Library is free software; you can redistribute it and/or | |
1017 | + modify it under the terms of the GNU Lesser General Public | |
1018 | + License as published by the Free Software Foundation; either | |
1019 | + version 2.1 of the License, or (at your option) any later version. | |
1020 | + | |
1021 | + The GNU C Library is distributed in the hope that it will be useful, | |
1022 | + but WITHOUT ANY WARRANTY; without even the implied warranty of | |
1023 | + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
1024 | + Lesser General Public License for more details. | |
1025 | + | |
1026 | + You should have received a copy of the GNU Lesser General Public | |
1027 | + License along with the GNU C Library; if not, write to the Free | |
1028 | + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA | |
1029 | + 02111-1307 USA. */ | |
1030 | + | |
1031 | +#if defined SHARED && !defined NOT_IN_libc | |
1032 | + | |
1033 | +#include <sysdep.h> | |
1034 | + | |
1035 | + .section .text.sse4.2,"ax",@progbits | |
1036 | +ENTRY (__strlen_sse42) | |
1037 | + pxor %xmm1, %xmm1 | |
1038 | + movl %edi, %ecx | |
1039 | + movq %rdi, %r8 | |
1040 | + andq $~15, %rdi | |
1041 | + xor %edi, %ecx | |
1042 | + pcmpeqb (%rdi), %xmm1 | |
1043 | + pmovmskb %xmm1, %edx | |
1044 | + shrl %cl, %edx | |
1045 | + shll %cl, %edx | |
1046 | + andl %edx, %edx | |
1047 | + jnz L(less16bytes) | |
1048 | + pxor %xmm1, %xmm1 | |
1049 | + | |
1050 | + .p2align 4 | |
1051 | +L(more64bytes_loop): | |
1052 | + pcmpistri $0x08, 16(%rdi), %xmm1 | |
1053 | + jz L(more32bytes) | |
1054 | + | |
1055 | + pcmpistri $0x08, 32(%rdi), %xmm1 | |
1056 | + jz L(more48bytes) | |
1057 | + | |
1058 | + pcmpistri $0x08, 48(%rdi), %xmm1 | |
1059 | + jz L(more64bytes) | |
1060 | + | |
1061 | + add $64, %rdi | |
1062 | + pcmpistri $0x08, (%rdi), %xmm1 | |
1063 | + jnz L(more64bytes_loop) | |
1064 | + leaq (%rdi,%rcx), %rax | |
1065 | + subq %r8, %rax | |
1066 | + ret | |
1067 | + | |
1068 | + .p2align 4 | |
1069 | +L(more32bytes): | |
1070 | + leaq 16(%rdi,%rcx, 1), %rax | |
1071 | + subq %r8, %rax | |
1072 | + ret | |
1073 | + | |
1074 | + .p2align 4 | |
1075 | +L(more48bytes): | |
1076 | + leaq 32(%rdi,%rcx, 1), %rax | |
1077 | + subq %r8, %rax | |
1078 | + ret | |
1079 | + | |
1080 | + .p2align 4 | |
1081 | +L(more64bytes): | |
1082 | + leaq 48(%rdi,%rcx, 1), %rax | |
1083 | + subq %r8, %rax | |
1084 | + ret | |
1085 | + | |
1086 | + .p2align 4 | |
1087 | +L(less16bytes): | |
1088 | + subq %r8, %rdi | |
1089 | + bsfl %edx, %eax | |
1090 | + addq %rdi, %rax | |
1091 | + ret | |
1092 | + | |
1093 | +END (__strlen_sse42) | |
1094 | + | |
1095 | +#endif | |
1096 | Index: glibc-2.12-2-gc4ccff1/sysdeps/x86_64/multiarch/strlen.S | |
1097 | =================================================================== | |
1098 | --- glibc-2.12-2-gc4ccff1.orig/sysdeps/x86_64/multiarch/strlen.S | |
1099 | +++ glibc-2.12-2-gc4ccff1/sysdeps/x86_64/multiarch/strlen.S | |
1100 | @@ -36,74 +36,12 @@ ENTRY(strlen) | |
1101 | testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip) | |
1102 | jz 2f | |
1103 | leaq __strlen_sse42(%rip), %rax | |
1104 | -2: ret | |
1105 | -END(strlen) | |
1106 | - | |
1107 | - | |
1108 | - .section .text.sse4.2,"ax",@progbits | |
1109 | - .align 16 | |
1110 | - .type __strlen_sse42, @function | |
1111 | -__strlen_sse42: | |
1112 | - cfi_startproc | |
1113 | - CALL_MCOUNT | |
1114 | - pxor %xmm1, %xmm1 | |
1115 | - movl %edi, %ecx | |
1116 | - movq %rdi, %r8 | |
1117 | - andq $~15, %rdi | |
1118 | - xor %edi, %ecx | |
1119 | - pcmpeqb (%rdi), %xmm1 | |
1120 | - pmovmskb %xmm1, %edx | |
1121 | - shrl %cl, %edx | |
1122 | - shll %cl, %edx | |
1123 | - andl %edx, %edx | |
1124 | - jnz L(less16bytes) | |
1125 | - pxor %xmm1, %xmm1 | |
1126 | - | |
1127 | - .p2align 4 | |
1128 | -L(more64bytes_loop): | |
1129 | - pcmpistri $0x08, 16(%rdi), %xmm1 | |
1130 | - jz L(more32bytes) | |
1131 | - | |
1132 | - pcmpistri $0x08, 32(%rdi), %xmm1 | |
1133 | - jz L(more48bytes) | |
1134 | - | |
1135 | - pcmpistri $0x08, 48(%rdi), %xmm1 | |
1136 | - jz L(more64bytes) | |
1137 | - | |
1138 | - add $64, %rdi | |
1139 | - pcmpistri $0x08, (%rdi), %xmm1 | |
1140 | - jnz L(more64bytes_loop) | |
1141 | - leaq (%rdi,%rcx), %rax | |
1142 | - subq %r8, %rax | |
1143 | - ret | |
1144 | - | |
1145 | - .p2align 4 | |
1146 | -L(more32bytes): | |
1147 | - leaq 16(%rdi,%rcx, 1), %rax | |
1148 | - subq %r8, %rax | |
1149 | - ret | |
1150 | - | |
1151 | - .p2align 4 | |
1152 | -L(more48bytes): | |
1153 | - leaq 32(%rdi,%rcx, 1), %rax | |
1154 | - subq %r8, %rax | |
1155 | - ret | |
1156 | - | |
1157 | - .p2align 4 | |
1158 | -L(more64bytes): | |
1159 | - leaq 48(%rdi,%rcx, 1), %rax | |
1160 | - subq %r8, %rax | |
1161 | ret | |
1162 | - | |
1163 | - .p2align 4 | |
1164 | -L(less16bytes): | |
1165 | - subq %r8, %rdi | |
1166 | - bsfl %edx, %eax | |
1167 | - addq %rdi, %rax | |
1168 | - ret | |
1169 | - cfi_endproc | |
1170 | - .size __strlen_sse42, .-__strlen_sse42 | |
1171 | - | |
1172 | +2: testl $bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip) | |
1173 | + jz 3f | |
1174 | + leaq __strlen_no_bsf(%rip), %rax | |
1175 | +3: ret | |
1176 | +END(strlen) | |
1177 | ||
1178 | # undef ENTRY | |
1179 | # define ENTRY(name) \ | |
1180 | Index: glibc-2.12-2-gc4ccff1/sysdeps/x86_64/strlen.S | |
1181 | =================================================================== | |
1182 | --- glibc-2.12-2-gc4ccff1.orig/sysdeps/x86_64/strlen.S | |
1183 | +++ glibc-2.12-2-gc4ccff1/sysdeps/x86_64/strlen.S | |
1184 | @@ -23,29 +23,80 @@ | |
1185 | ||
1186 | .text | |
1187 | ENTRY(strlen) | |
1188 | - pxor %xmm2, %xmm2 | |
1189 | - movq %rdi, %rcx | |
1190 | - movq %rdi, %r8 | |
1191 | - andq $~15, %rdi | |
1192 | - movdqa %xmm2, %xmm1 | |
1193 | - pcmpeqb (%rdi), %xmm2 | |
1194 | - orl $0xffffffff, %esi | |
1195 | - subq %rdi, %rcx | |
1196 | - shll %cl, %esi | |
1197 | - pmovmskb %xmm2, %edx | |
1198 | - andl %esi, %edx | |
1199 | - jnz 1f | |
1200 | - | |
1201 | -2: movdqa 16(%rdi), %xmm0 | |
1202 | - leaq 16(%rdi), %rdi | |
1203 | + xor %rax, %rax | |
1204 | + mov %edi, %ecx | |
1205 | + and $0x3f, %ecx | |
1206 | + pxor %xmm0, %xmm0 | |
1207 | + cmp $0x30, %ecx | |
1208 | + ja L(next) | |
1209 | + movdqu (%rdi), %xmm1 | |
1210 | pcmpeqb %xmm1, %xmm0 | |
1211 | pmovmskb %xmm0, %edx | |
1212 | - testl %edx, %edx | |
1213 | - jz 2b | |
1214 | + test %edx, %edx | |
1215 | + jnz L(exit_less16) | |
1216 | + mov %rdi, %rax | |
1217 | + and $-16, %rax | |
1218 | + jmp L(align16_start) | |
1219 | +L(next): | |
1220 | + mov %rdi, %rax | |
1221 | + and $-16, %rax | |
1222 | + pcmpeqb (%rax), %xmm0 | |
1223 | + mov $-1, %esi | |
1224 | + sub %rax, %rcx | |
1225 | + shl %cl, %esi | |
1226 | + pmovmskb %xmm0, %edx | |
1227 | + and %esi, %edx | |
1228 | + jnz L(exit) | |
1229 | +L(align16_start): | |
1230 | + pxor %xmm0, %xmm0 | |
1231 | + pxor %xmm1, %xmm1 | |
1232 | + pxor %xmm2, %xmm2 | |
1233 | + pxor %xmm3, %xmm3 | |
1234 | + .p2align 4 | |
1235 | +L(align16_loop): | |
1236 | + pcmpeqb 16(%rax), %xmm0 | |
1237 | + pmovmskb %xmm0, %edx | |
1238 | + test %edx, %edx | |
1239 | + jnz L(exit16) | |
1240 | ||
1241 | -1: subq %r8, %rdi | |
1242 | - bsfl %edx, %eax | |
1243 | - addq %rdi, %rax | |
1244 | + pcmpeqb 32(%rax), %xmm1 | |
1245 | + pmovmskb %xmm1, %edx | |
1246 | + test %edx, %edx | |
1247 | + jnz L(exit32) | |
1248 | + | |
1249 | + pcmpeqb 48(%rax), %xmm2 | |
1250 | + pmovmskb %xmm2, %edx | |
1251 | + test %edx, %edx | |
1252 | + jnz L(exit48) | |
1253 | + | |
1254 | + pcmpeqb 64(%rax), %xmm3 | |
1255 | + pmovmskb %xmm3, %edx | |
1256 | + lea 64(%rax), %rax | |
1257 | + test %edx, %edx | |
1258 | + jz L(align16_loop) | |
1259 | +L(exit): | |
1260 | + sub %rdi, %rax | |
1261 | +L(exit_less16): | |
1262 | + bsf %rdx, %rdx | |
1263 | + add %rdx, %rax | |
1264 | + ret | |
1265 | + .p2align 4 | |
1266 | +L(exit16): | |
1267 | + sub %rdi, %rax | |
1268 | + bsf %rdx, %rdx | |
1269 | + lea 16(%rdx,%rax), %rax | |
1270 | + ret | |
1271 | + .p2align 4 | |
1272 | +L(exit32): | |
1273 | + sub %rdi, %rax | |
1274 | + bsf %rdx, %rdx | |
1275 | + lea 32(%rdx,%rax), %rax | |
1276 | + ret | |
1277 | + .p2align 4 | |
1278 | +L(exit48): | |
1279 | + sub %rdi, %rax | |
1280 | + bsf %rdx, %rdx | |
1281 | + lea 48(%rdx,%rax), %rax | |
1282 | ret | |
1283 | END(strlen) | |
1284 | libc_hidden_builtin_def (strlen) |