git.ipfire.org Git - thirdparty/glibc.git/blob - sysdeps/i386/i686/multiarch/strcat-sse2.S
Fix x86 strncat optimized implementation for large sizes
[thirdparty/glibc.git] / sysdeps / i386 / i686 / multiarch / strcat-sse2.S
1 /* strcat with SSE2
2 Copyright (C) 2011-2017 Free Software Foundation, Inc.
3 Contributed by Intel Corporation.
4 This file is part of the GNU C Library.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
19
20
21 #if IS_IN (libc)
22
23 # include <sysdep.h>
24
25
/* Unwind annotation for a 4-byte push of REG: the CFA offset grows by 4
   and REG is recorded as saved at offset 0 from the stack pointer.  */
26 # define CFI_PUSH(REG) \
27 cfi_adjust_cfa_offset (4); \
28 cfi_rel_offset (REG, 0)
29
/* Unwind annotation for a 4-byte pop of REG: the CFA offset shrinks by 4
   and REG is marked as restored.  */
30 # define CFI_POP(REG) \
31 cfi_adjust_cfa_offset (-4); \
32 cfi_restore (REG)
33
/* pushl/popl of REG together with the matching unwind annotation.  */
34 # define PUSH(REG) pushl REG; CFI_PUSH (REG)
35 # define POP(REG) popl REG; CFI_POP (REG)
36
/* Two flavours of jump-table support.  In the SHARED build each table
   entry holds the distance from the table base B to the target I, and
   the branch macro rebuilds the absolute address at run time (it
   clobbers %ecx).  Otherwise each entry is the target address itself
   and the branch is a single indirect jump.  */
37 # ifdef SHARED
38 # define JMPTBL(I, B) I - B
39
40 /* Load an entry in a jump table into ECX and branch to it. TABLE is a
41 jump table with relative offsets. INDEX is a register that contains the
42 index into the jump table. SCALE is the scale of INDEX. */
43
44 # define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
45 /* We first load PC into ECX. */ \
46 SETUP_PIC_REG(cx); \
47 /* Get the address of the jump table. */ \
48 addl $(TABLE - .), %ecx; \
49 /* Get the entry and convert the relative offset to the \
50 absolute address. */ \
51 addl (%ecx,INDEX,SCALE), %ecx; \
52 /* We loaded the jump table and adjusted ECX. Go. */ \
53 jmp *%ecx
54 # else
55 # define JMPTBL(I, B) I
56
57 /* Branch to an entry in a jump table. TABLE is a jump table with
58 absolute offsets. INDEX is a register that contains the index into the
59 jump table. SCALE is the scale of INDEX. */
60
61 # define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
62 jmp *TABLE(,INDEX,SCALE)
63 # endif
64
/* Symbol name of the routine; a file that includes this one may define
   STRCAT beforehand to choose a different name.  */
65 # ifndef STRCAT
66 # define STRCAT __strcat_sse2
67 # endif
68
/* Offsets of the arguments from %esp.  STR1 and STR2 are read after one
   register has been pushed, hence the extra 4 on top of PARMS (the
   return address slot).  */
69 # define PARMS 4
70 # define STR1 PARMS+4
71 # define STR2 STR1+4
72
/* LEN is read after a second register (%ebx) has been pushed in the
   strncat build.  STR3 is the offset of the first argument as seen by
   the exit code, which reloads it into %eax as the return value; it is
   4 larger in the strncat build because of that second push.  */
73 # ifdef USE_AS_STRNCAT
74 # define LEN STR2+8
75 # define STR3 STR1+4
76 # else
77 # define STR3 STR1
78 # endif
79
80 # define USE_AS_STRCAT
/* RETURN pops the saved registers and returns to the caller.  The code
   that follows an expansion of RETURN is only reached by jumps taken
   while the registers are still pushed, so the CFI_PUSH annotations
   after `ret' re-establish the unwind state for it.  They must be
   listed in the order the registers were really pushed (%esi first,
   then %ebx in the strncat build); each CFI_PUSH records its register
   at the then-current top of stack, so the reverse order would swap
   the two save slots in the unwind information.  */
81 # ifdef USE_AS_STRNCAT
82 # define RETURN POP(%ebx); POP(%esi); ret; CFI_PUSH(%esi); CFI_PUSH(%ebx);
83 # else
84 # define RETURN POP(%esi); ret; CFI_PUSH(%esi);
85 # endif
86
/* Entry point.  Register roles set up here and used by the rest of the
   routine:
     %eax  destination pointer (first argument); advanced to the
           destination's terminating NUL by the length scan
     %esi  source pointer (second argument)
     %ebx  remaining byte limit (third argument, strncat build only)
   Returns the destination pointer in %eax.  */
87 .text
88 ENTRY (STRCAT)
89 PUSH (%esi)
90 mov STR1(%esp), %eax
91 mov STR2(%esp), %esi
92 # ifdef USE_AS_STRNCAT
93 PUSH (%ebx)
94 movl LEN(%esp), %ebx
/* A zero limit means there is nothing to append.  */
95 test %ebx, %ebx
96 jz L(ExitZero)
97 # endif
/* An empty source string also means there is nothing to append.  The
   two mov instructions do not change the flags set by cmpb.  */
98 cmpb $0, (%esi)
99 mov %esi, %ecx
100 mov %eax, %edx
101 jz L(ExitZero)
102
/* Choose a path from the offsets of source and destination inside
   their 64-byte blocks.  The fast path below reads 32 source bytes and
   16 destination bytes with unaligned loads; with a source offset of at
   most 32 and a destination offset of at most 48 those reads stay
   inside the current 64-byte block.  */
103 and $63, %ecx
104 and $63, %edx
105 cmp $32, %ecx
106 ja L(StrlenCore7_1)
107 cmp $48, %edx
108 ja L(alignment_prolog)
109
/* Fast path: preload the first 32 source bytes (%xmm5, %xmm6) and their
   NUL-compare results (%xmm4, %xmm7) while checking the first 16
   destination bytes for the terminator.  */
110 pxor %xmm0, %xmm0
111 pxor %xmm4, %xmm4
112 pxor %xmm7, %xmm7
113 movdqu (%eax), %xmm1
114 movdqu (%esi), %xmm5
115 pcmpeqb %xmm1, %xmm0
116 movdqu 16(%esi), %xmm6
117 pmovmskb %xmm0, %ecx
118 pcmpeqb %xmm5, %xmm4
119 pcmpeqb %xmm6, %xmm7
120 test %ecx, %ecx
121 jnz L(exit_less16_)
/* No NUL in the first 16 destination bytes: continue the scan with
   aligned loads starting at the next 16-byte boundary.  */
122 mov %eax, %ecx
123 and $-16, %eax
124 jmp L(loop_prolog)
125
/* Destination length scan used when the destination starts in the last
   quarter of its 64-byte block.  The first probe is an aligned 16-byte
   compare; the mask bits belonging to bytes before the real start are
   shifted out with %cl.  The first 32 source bytes are preloaded into
   %xmm5/%xmm6 with their NUL masks in %xmm4/%xmm7, exactly as on the
   fast path.  */
126 L(alignment_prolog):
127 pxor %xmm0, %xmm0
128 pxor %xmm4, %xmm4
129 mov %edx, %ecx
130 pxor %xmm7, %xmm7
131 and $15, %ecx
132 and $-16, %eax
133 pcmpeqb (%eax), %xmm0
134 movdqu (%esi), %xmm5
135 movdqu 16(%esi), %xmm6
136 pmovmskb %xmm0, %edx
137 pcmpeqb %xmm5, %xmm4
138 shr %cl, %edx
139 pcmpeqb %xmm6, %xmm7
140 test %edx, %edx
141 jnz L(exit_less16)
142 add %eax, %ecx
143
/* %xmm0 may hold matches for the bytes that were shifted out above, so
   clear it before it is reused as a compare register.  */
144 pxor %xmm0, %xmm0
145 L(loop_prolog):
146 pxor %xmm1, %xmm1
147 pxor %xmm2, %xmm2
148 pxor %xmm3, %xmm3
149 .p2align 4
/* Scan the destination 64 bytes per iteration with aligned compares.
   A compare that finds no NUL leaves its register all-zero, so
   %xmm0-%xmm3 can be reused on the next iteration without clearing.  */
150 L(align16_loop):
151 pcmpeqb 16(%eax), %xmm0
152 pmovmskb %xmm0, %edx
153 test %edx, %edx
154 jnz L(exit16)
155
156 pcmpeqb 32(%eax), %xmm1
157 pmovmskb %xmm1, %edx
158 test %edx, %edx
159 jnz L(exit32)
160
161 pcmpeqb 48(%eax), %xmm2
162 pmovmskb %xmm2, %edx
163 test %edx, %edx
164 jnz L(exit48)
165
166 pcmpeqb 64(%eax), %xmm3
167 pmovmskb %xmm3, %edx
/* lea advances the cursor without disturbing the flags logic: the test
   that follows is what decides the branch.  */
168 lea 64(%eax), %eax
169 test %edx, %edx
170 jz L(align16_loop)
171 bsf %edx, %edx
172 add %edx, %eax
173 jmp L(StartStrcpyPart)
174
/* The exits below leave %eax pointing at the destination's NUL: chunk
   offset plus the index of the lowest set mask bit.  */
175 .p2align 4
176 L(exit16):
177 bsf %edx, %edx
178 lea 16(%eax, %edx), %eax
179 jmp L(StartStrcpyPart)
180
181 .p2align 4
182 L(exit32):
183 bsf %edx, %edx
184 lea 32(%eax, %edx), %eax
185 jmp L(StartStrcpyPart)
186
187 .p2align 4
188 L(exit48):
189 bsf %edx, %edx
190 lea 48(%eax, %edx), %eax
191 jmp L(StartStrcpyPart)
192
/* NUL found by the aligned first probe: %eax is the aligned base, %ecx
   the misalignment and %edx the shifted mask.  */
193 .p2align 4
194 L(exit_less16):
195 bsf %edx, %edx
196 add %ecx, %eax
197 add %edx, %eax
198 jmp L(StartStrcpyPart)
199
/* NUL found by the unaligned first probe of the fast path: %eax is
   still the original destination and %ecx the mask.  Control falls
   through into the label that follows this block.  */
200 .p2align 4
201 L(exit_less16_):
202 bsf %ecx, %ecx
203 add %ecx, %eax
204
/* Copy phase for the paths that preloaded the source.  On entry %eax
   points at the destination's NUL, %xmm5/%xmm6 hold source bytes 0-15
   and 16-31, and %xmm4/%xmm7 hold their compare-with-zero results.  */
205 .p2align 4
206 L(StartStrcpyPart):
207 pmovmskb %xmm4, %edx
208 # ifdef USE_AS_STRNCAT
/* Limit of at most 16 bytes: the copy ends inside the first chunk.  */
209 cmp $16, %ebx
210 jbe L(CopyFrom1To16BytesTail1Case2OrCase3)
211 # endif
212 test %edx, %edx
213 jnz L(CopyFrom1To16BytesTail1)
214
/* First 16 source bytes hold no NUL: store them and look at the
   second chunk.  */
215 movdqu %xmm5, (%eax)
216 pmovmskb %xmm7, %edx
217 # ifdef USE_AS_STRNCAT
218 cmp $32, %ebx
219 jbe L(CopyFrom1To32Bytes1Case2OrCase3)
220 # endif
221 test %edx, %edx
222 jnz L(CopyFrom1To32Bytes1)
223
/* Switch to aligned source loads: round %esi down to a 16-byte
   boundary and move %eax back by the same amount (%ecx) so that equal
   offsets from both registers still address corresponding bytes.  */
224 mov %esi, %ecx
225 and $-16, %esi
226 and $15, %ecx
227 pxor %xmm0, %xmm0
228 # ifdef USE_AS_STRNCAT
/* The limit is now counted from the aligned source address.  If the
   addition carries, sbb produces all-ones and the or saturates %ebx to
   0xffffffff instead of letting it wrap.  */
229 add %ecx, %ebx
230 sbb %edx, %edx
231 or %edx, %ebx
232 # endif
233 sub %ecx, %eax
234 jmp L(Unalign16Both)
235
/* Destination length scan for the case where the source lies in the
   upper part of its 64-byte block (offset above 32).  Nothing is
   preloaded from the source here; the copy phase that follows
   (StartStrcpyPart_1) reads the source with aligned probes.  */
236 L(StrlenCore7_1):
237 mov %eax, %ecx
238 pxor %xmm0, %xmm0
239 and $15, %ecx
240 and $-16, %eax
/* Aligned probe of the chunk containing the destination start; mask
   bits for bytes before the start are shifted out with %cl.  */
241 pcmpeqb (%eax), %xmm0
242 pmovmskb %xmm0, %edx
243 shr %cl, %edx
244 test %edx, %edx
245 jnz L(exit_less16_1)
246 add %eax, %ecx
247
248 pxor %xmm0, %xmm0
249 pxor %xmm1, %xmm1
250 pxor %xmm2, %xmm2
251 pxor %xmm3, %xmm3
252
/* Scan 64 destination bytes per iteration.  A compare that finds no
   NUL leaves its register all-zero, so the four registers are reused
   without clearing.  */
253 .p2align 4
254 L(align16_loop_1):
255 pcmpeqb 16(%eax), %xmm0
256 pmovmskb %xmm0, %edx
257 test %edx, %edx
258 jnz L(exit16_1)
259
260 pcmpeqb 32(%eax), %xmm1
261 pmovmskb %xmm1, %edx
262 test %edx, %edx
263 jnz L(exit32_1)
264
265 pcmpeqb 48(%eax), %xmm2
266 pmovmskb %xmm2, %edx
267 test %edx, %edx
268 jnz L(exit48_1)
269
270 pcmpeqb 64(%eax), %xmm3
271 pmovmskb %xmm3, %edx
272 lea 64(%eax), %eax
273 test %edx, %edx
274 jz L(align16_loop_1)
275 bsf %edx, %edx
276 add %edx, %eax
277 jmp L(StartStrcpyPart_1)
278
/* Each exit leaves %eax pointing at the destination's NUL.  */
279 .p2align 4
280 L(exit16_1):
281 bsf %edx, %edx
282 lea 16(%eax, %edx), %eax
283 jmp L(StartStrcpyPart_1)
284
285 .p2align 4
286 L(exit32_1):
287 bsf %edx, %edx
288 lea 32(%eax, %edx), %eax
289 jmp L(StartStrcpyPart_1)
290
291 .p2align 4
292 L(exit48_1):
293 bsf %edx, %edx
294 lea 48(%eax, %edx), %eax
295 jmp L(StartStrcpyPart_1)
296
/* NUL inside the first probed chunk: aligned base + misalignment +
   bit index.  Control falls through into the label that follows.  */
297 .p2align 4
298 L(exit_less16_1):
299 bsf %edx, %edx
300 add %ecx, %eax
301 add %edx, %eax
302
/* Copy phase for the path that did not preload the source.  On entry
   %eax points at the destination's NUL.  The source is probed with
   aligned 16-byte compares: %esi is rounded down to a 16-byte boundary
   and %ecx keeps the misalignment.  */
303 .p2align 4
304 L(StartStrcpyPart_1):
305 mov %esi, %ecx
306 and $15, %ecx
307 and $-16, %esi
308 pxor %xmm0, %xmm0
309 pxor %xmm1, %xmm1
310
311 # ifdef USE_AS_STRNCAT
/* Limits above 48 take a separate prologue (BigN) that performs no
   limit checks on the first chunks.  */
312 cmp $48, %ebx
313 ja L(BigN)
314 # endif
315 pcmpeqb (%esi), %xmm1
316 # ifdef USE_AS_STRNCAT
/* Count the limit from the aligned source address.  %ebx is at most 48
   here and %ecx at most 15, so this addition cannot overflow.  */
317 add %ecx, %ebx
318 # endif
319 pmovmskb %xmm1, %edx
/* Drop the mask bits of the bytes that precede the real source start. */
320 shr %cl, %edx
321 # ifdef USE_AS_STRNCAT
322 cmp $16, %ebx
323 jbe L(CopyFrom1To16BytesTailCase2OrCase3)
324 # endif
325 test %edx, %edx
326 jnz L(CopyFrom1To16BytesTail)
327
328 pcmpeqb 16(%esi), %xmm0
329 pmovmskb %xmm0, %edx
330 # ifdef USE_AS_STRNCAT
331 cmp $32, %ebx
332 jbe L(CopyFrom1To32BytesCase2OrCase3)
333 # endif
334 test %edx, %edx
335 jnz L(CopyFrom1To32Bytes)
336
/* No NUL in the first two aligned chunks: copy the first 16 real source
   bytes, then move %eax back by the misalignment so that equal offsets
   from %esi and %eax address corresponding bytes.  Control falls
   through into the label that follows this block.  */
337 movdqu (%esi, %ecx), %xmm1 /* copy 16 bytes */
338 movdqu %xmm1, (%eax)
339 sub %ecx, %eax
340
/* Main 16-bytes-at-a-time copy.  %esi is 16-byte aligned, %eax is the
   destination cursor shifted by the same misalignment, %ecx is the
   running offset applied to both, and %xmm0 is all-zero.  Each step
   loads the next aligned source chunk, stores the previous one with an
   unaligned store, and compares the new chunk with zero.  %xmm0 stays
   all-zero for as long as no NUL is found, so it is reused as the
   compare register in every step.  In the strncat build %ebx is
   decremented per step; once it reaches zero or borrows, the limit
   falls in the chunk just examined and the Case2/Case3 code finishes
   the copy.  */
341 .p2align 4
342 L(Unalign16Both):
343 mov $16, %ecx
344 movdqa (%esi, %ecx), %xmm1
345 movaps 16(%esi, %ecx), %xmm2
346 movdqu %xmm1, (%eax, %ecx)
347 pcmpeqb %xmm2, %xmm0
348 pmovmskb %xmm0, %edx
349 add $16, %ecx
350 # ifdef USE_AS_STRNCAT
351 sub $48, %ebx
352 jbe L(CopyFrom1To16BytesCase2OrCase3)
353 # endif
354 test %edx, %edx
355 jnz L(CopyFrom1To16Bytes)
/* The BigN prologue joins here after performing the first step itself. */
356 L(Unalign16BothBigN):
357 movaps 16(%esi, %ecx), %xmm3
358 movdqu %xmm2, (%eax, %ecx)
359 pcmpeqb %xmm3, %xmm0
360 pmovmskb %xmm0, %edx
361 add $16, %ecx
362 # ifdef USE_AS_STRNCAT
363 sub $16, %ebx
364 jbe L(CopyFrom1To16BytesCase2OrCase3)
365 # endif
366 test %edx, %edx
367 jnz L(CopyFrom1To16Bytes)
368
369 movaps 16(%esi, %ecx), %xmm4
370 movdqu %xmm3, (%eax, %ecx)
371 pcmpeqb %xmm4, %xmm0
372 pmovmskb %xmm0, %edx
373 add $16, %ecx
374 # ifdef USE_AS_STRNCAT
375 sub $16, %ebx
376 jbe L(CopyFrom1To16BytesCase2OrCase3)
377 # endif
378 test %edx, %edx
379 jnz L(CopyFrom1To16Bytes)
380
381 movaps 16(%esi, %ecx), %xmm1
382 movdqu %xmm4, (%eax, %ecx)
383 pcmpeqb %xmm1, %xmm0
384 pmovmskb %xmm0, %edx
385 add $16, %ecx
386 # ifdef USE_AS_STRNCAT
387 sub $16, %ebx
388 jbe L(CopyFrom1To16BytesCase2OrCase3)
389 # endif
390 test %edx, %edx
391 jnz L(CopyFrom1To16Bytes)
392
393 movaps 16(%esi, %ecx), %xmm2
394 movdqu %xmm1, (%eax, %ecx)
395 pcmpeqb %xmm2, %xmm0
396 pmovmskb %xmm0, %edx
397 add $16, %ecx
398 # ifdef USE_AS_STRNCAT
399 sub $16, %ebx
400 jbe L(CopyFrom1To16BytesCase2OrCase3)
401 # endif
402 test %edx, %edx
403 jnz L(CopyFrom1To16Bytes)
404
405 movaps 16(%esi, %ecx), %xmm3
406 movdqu %xmm2, (%eax, %ecx)
407 pcmpeqb %xmm3, %xmm0
408 pmovmskb %xmm0, %edx
409 add $16, %ecx
410 # ifdef USE_AS_STRNCAT
411 sub $16, %ebx
412 jbe L(CopyFrom1To16BytesCase2OrCase3)
413 # endif
414 test %edx, %edx
415 jnz L(CopyFrom1To16Bytes)
416
/* No terminator so far: store the last chunk loaded, then set up the
   64-byte loop.  The source cursor is rounded down to a 64-byte
   boundary; %edx receives (old %esi - new %esi) and the destination
   cursor and, in the strncat build, the remaining limit are shifted by
   the same distance.  */
417 movdqu %xmm3, (%eax, %ecx)
418 mov %esi, %edx
419 lea 16(%esi, %ecx), %esi
420 and $-0x40, %esi
421 sub %esi, %edx
422 sub %edx, %eax
423 # ifdef USE_AS_STRNCAT
424 lea 128(%ebx, %edx), %ebx
425 # endif
/* Load the first 64-byte group.  %xmm4-%xmm7 keep the four chunks for
   the later stores.  The byte-wise minimum over all four chunks
   contains a zero byte exactly when some chunk contains one, so a
   single compare and mask tests the whole group.  */
426 movaps (%esi), %xmm2
427 movaps %xmm2, %xmm4
428 movaps 16(%esi), %xmm5
429 movaps 32(%esi), %xmm3
430 movaps %xmm3, %xmm6
431 movaps 48(%esi), %xmm7
432 pminub %xmm5, %xmm2
433 pminub %xmm7, %xmm3
434 pminub %xmm2, %xmm3
435 pcmpeqb %xmm0, %xmm3
436 pmovmskb %xmm3, %edx
437 # ifdef USE_AS_STRNCAT
438 sub $64, %ebx
439 jbe L(UnalignedLeaveCase2OrCase3)
440 # endif
441 test %edx, %edx
442 jnz L(Unaligned64Leave)
443
/* 64-byte copy loop.  On entry %xmm4-%xmm7 hold a 64-byte source group
   already known to contain no NUL.  Each iteration advances both
   cursors, stores that group (unaligned stores) while loading the next
   one (aligned loads) into the same registers, and tests the new group
   for a zero byte through the byte-wise minimum of its four chunks.
   %xmm0 is all-zero throughout the loop.  */
444 .p2align 4
445 L(Unaligned64Loop_start):
446 add $64, %eax
447 add $64, %esi
448 movdqu %xmm4, -64(%eax)
449 movaps (%esi), %xmm2
450 movdqa %xmm2, %xmm4
451 movdqu %xmm5, -48(%eax)
452 movaps 16(%esi), %xmm5
453 pminub %xmm5, %xmm2
454 movaps 32(%esi), %xmm3
455 movdqu %xmm6, -32(%eax)
456 movaps %xmm3, %xmm6
457 movdqu %xmm7, -16(%eax)
458 movaps 48(%esi), %xmm7
459 pminub %xmm7, %xmm3
460 pminub %xmm2, %xmm3
461 pcmpeqb %xmm0, %xmm3
462 pmovmskb %xmm3, %edx
463 # ifdef USE_AS_STRNCAT
464 sub $64, %ebx
465 jbe L(UnalignedLeaveCase2OrCase3)
466 # endif
467 test %edx, %edx
468 jz L(Unaligned64Loop_start)
469
/* The group in %xmm4-%xmm7 contains the terminator.  Find the chunk
   that holds it, store the chunks in front of it, and let the exit
   table copy the final 1-16 bytes.  */
470 L(Unaligned64Leave):
471 pxor %xmm1, %xmm1
472
473 pcmpeqb %xmm4, %xmm0
474 pcmpeqb %xmm5, %xmm1
475 pmovmskb %xmm0, %edx
476 pmovmskb %xmm1, %ecx
477 test %edx, %edx
478 jnz L(CopyFrom1To16BytesUnaligned_0)
479 test %ecx, %ecx
480 jnz L(CopyFrom1To16BytesUnaligned_16)
481
/* Neither of the first two chunks matched, so %xmm0 and %xmm1 are
   all-zero again and can be reused for the last two chunks.  */
482 pcmpeqb %xmm6, %xmm0
483 pcmpeqb %xmm7, %xmm1
484 pmovmskb %xmm0, %edx
485 pmovmskb %xmm1, %ecx
486 test %edx, %edx
487 jnz L(CopyFrom1To16BytesUnaligned_32)
488
/* The NUL is in the fourth chunk: store the first three and dispatch
   on its position within the fourth.  */
489 bsf %ecx, %edx
490 movdqu %xmm4, (%eax)
491 movdqu %xmm5, 16(%eax)
492 movdqu %xmm6, 32(%eax)
493 add $48, %esi
494 add $48, %eax
495 BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
496
497 # ifdef USE_AS_STRNCAT
/* strncat prologue for limits above 48 (reached from StartStrcpyPart_1
   with %esi rounded down to a 16-byte boundary, %ecx holding the
   misalignment and %xmm0/%xmm1 all-zero).  It mirrors the start of the
   regular path but performs no limit checks on the first chunks, and
   it subtracts 48 from the limit before adding the misalignment, so
   the adjustment cannot overflow %ebx.  */
498 .p2align 4
499 L(BigN):
500 pcmpeqb (%esi), %xmm1
501 pmovmskb %xmm1, %edx
/* Ignore mask bits of bytes in front of the real source start.  */
502 shr %cl, %edx
503 test %edx, %edx
504 jnz L(CopyFrom1To16BytesTail)
505
506 pcmpeqb 16(%esi), %xmm0
507 pmovmskb %xmm0, %edx
508 test %edx, %edx
509 jnz L(CopyFrom1To32Bytes)
510
511 movdqu (%esi, %ecx), %xmm1 /* copy 16 bytes */
512 movdqu %xmm1, (%eax)
513 sub %ecx, %eax
514 sub $48, %ebx
515 add %ecx, %ebx
516
/* First step of the 16-byte copy sequence, then join that sequence at
   its second step.  */
517 mov $16, %ecx
518 movdqa (%esi, %ecx), %xmm1
519 movaps 16(%esi, %ecx), %xmm2
520 movdqu %xmm1, (%eax, %ecx)
521 pcmpeqb %xmm2, %xmm0
522 pmovmskb %xmm0, %edx
523 add $16, %ecx
524 test %edx, %edx
525 jnz L(CopyFrom1To16Bytes)
526 jmp L(Unalign16BothBigN)
527 # endif
528
529 /*------------end of main part-------------------------------*/
530
531 /* Case1: the source terminator was found and no length limit applies.
   Each stub makes %esi/%eax point at the first byte still to be
   copied, turns the NUL mask into a byte index with bsf, and
   dispatches through ExitTable, whose entry for index i copies i + 1
   bytes (the terminator included).  */
532 .p2align 4
533 L(CopyFrom1To16Bytes):
/* %ecx is the running chunk offset applied to both cursors.  */
534 add %ecx, %eax
535 add %ecx, %esi
536 bsf %edx, %edx
537 BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
538
539 .p2align 4
540 L(CopyFrom1To16BytesTail):
/* %esi was rounded down; %ecx restores the real source start.  */
541 add %ecx, %esi
542 bsf %edx, %edx
543 BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
544
545 .p2align 4
546 L(CopyFrom1To32Bytes1):
/* The first 16 bytes have already been stored; skip them.  */
547 add $16, %esi
548 add $16, %eax
549 L(CopyFrom1To16BytesTail1):
550 bsf %edx, %edx
551 BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
552
553 .p2align 4
554 L(CopyFrom1To32Bytes):
/* The NUL is in the second aligned chunk and nothing has been stored
   yet: its index counted from the real source start is
   bit index + 16 - misalignment.  */
555 bsf %edx, %edx
556 add %ecx, %esi
557 add $16, %edx
558 sub %ecx, %edx
559 BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
560
/* Exits of the 64-byte loop: the chunks in front of the one holding
   the NUL are stored here, then the exit table copies the rest.  */
561 .p2align 4
562 L(CopyFrom1To16BytesUnaligned_0):
563 bsf %edx, %edx
564 BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
565
566 .p2align 4
567 L(CopyFrom1To16BytesUnaligned_16):
568 bsf %ecx, %edx
569 movdqu %xmm4, (%eax)
570 add $16, %esi
571 add $16, %eax
572 BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
573
574 .p2align 4
575 L(CopyFrom1To16BytesUnaligned_32):
576 bsf %edx, %edx
577 movdqu %xmm4, (%eax)
578 movdqu %xmm5, 16(%eax)
579 add $32, %esi
580 add $32, %eax
581 BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
582
583 # ifdef USE_AS_STRNCAT
584
/* strncat-only endings.  Two tables are used: ExitTable[i] copies
   i + 1 source bytes (the last one being the source's NUL), while
   ExitStrncatTable[k] copies k source bytes and then stores a NUL
   after them.  */
585 .p2align 4
586 L(CopyFrom1To16BytesExit):
587 BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
588
589 /* Case2: the chunk under examination contains both the source NUL and
   the end of the length limit.  After the cursors and the remaining
   count in %ebx are fixed up, the NUL index in %edx is compared with
   %ebx: if the NUL comes first, copy up to and including it;
   otherwise copy %ebx bytes and append a NUL.  */
590
591 .p2align 4
592 L(CopyFrom1To16BytesCase2):
/* Undo the over-subtraction of the step that detected the limit.  */
593 add $16, %ebx
594 add %ecx, %eax
595 add %ecx, %esi
596 bsf %edx, %edx
597 cmp %ebx, %edx
598 jb L(CopyFrom1To16BytesExit)
599 BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
600
601 .p2align 4
602 L(CopyFrom1To32BytesCase2):
/* %ebx was counted from the aligned source address; take the
   misalignment back out and restore the real source start.  */
603 sub %ecx, %ebx
604 add %ecx, %esi
605 bsf %edx, %edx
606 add $16, %edx
607 sub %ecx, %edx
608 cmp %ebx, %edx
609 jb L(CopyFrom1To16BytesExit)
610 BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
611
612 L(CopyFrom1To16BytesTailCase2):
613 sub %ecx, %ebx
614 add %ecx, %esi
615 bsf %edx, %edx
616 cmp %ebx, %edx
617 jb L(CopyFrom1To16BytesExit)
618 BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
619
620 L(CopyFrom1To16BytesTail1Case2):
621 bsf %edx, %edx
622 cmp %ebx, %edx
623 jb L(CopyFrom1To16BytesExit)
624 BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
625
626 /* Case2 or Case3: the length limit ends in the current chunk.  A
   non-zero NUL mask in %edx selects Case2 above; otherwise (Case3)
   the chunk holds no NUL, so exactly %ebx more bytes are copied and a
   NUL is appended.  */
627
628 .p2align 4
629 L(CopyFrom1To16BytesCase2OrCase3):
630 test %edx, %edx
631 jnz L(CopyFrom1To16BytesCase2)
632 L(CopyFrom1To16BytesCase3):
633 add $16, %ebx
634 add %ecx, %eax
635 add %ecx, %esi
636 BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
637
638 .p2align 4
639 L(CopyFrom1To32BytesCase2OrCase3):
640 test %edx, %edx
641 jnz L(CopyFrom1To32BytesCase2)
642 sub %ecx, %ebx
643 add %ecx, %esi
644 BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
645
646 .p2align 4
647 L(CopyFrom1To16BytesTailCase2OrCase3):
648 test %edx, %edx
649 jnz L(CopyFrom1To16BytesTailCase2)
650 sub %ecx, %ebx
651 add %ecx, %esi
652 BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
653
654 .p2align 4
655 L(CopyFrom1To32Bytes1Case2OrCase3):
/* The first 16 bytes were already stored: step past them and charge
   them against the limit.  */
656 add $16, %eax
657 add $16, %esi
658 sub $16, %ebx
659 L(CopyFrom1To16BytesTail1Case2OrCase3):
660 test %edx, %edx
661 jnz L(CopyFrom1To16BytesTail1Case2)
662 BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
663
664 # endif
665
/* Exit stubs for 0-8 bytes.  ExitN copies N bytes from (%esi) to
   (%eax), reloads the original destination pointer as the return value
   and returns.  In the strncat build StrncatExitN first stores a NUL at
   offset N and then falls through into ExitN.  That NUL comes from
   %bh, which is zero because %ebx is the ExitStrncatTable index (0-32).
   Where a stub stores %dh as its last byte, the strcat build relies on
   %dh being zero because %edx is the ExitTable index (0-31), so the
   byte written is the terminator; the strncat build loads the real
   source byte into %dh first.  Several stubs use two overlapping moves
   to cover a length that is not a power of two.  */
666 # ifdef USE_AS_STRNCAT
667 .p2align 4
668 L(StrncatExit0):
669 movb %bh, (%eax)
670 mov STR3(%esp), %eax
671 RETURN
672 # endif
673
674 .p2align 4
675 # ifdef USE_AS_STRNCAT
676 L(StrncatExit1):
677 movb %bh, 1(%eax)
678 # endif
679 L(Exit1):
680 # ifdef USE_AS_STRNCAT
681 movb (%esi), %dh
682 # endif
683 movb %dh, (%eax)
684 mov STR3(%esp), %eax
685 RETURN
686
687 .p2align 4
688 # ifdef USE_AS_STRNCAT
689 L(StrncatExit2):
690 movb %bh, 2(%eax)
691 # endif
692 L(Exit2):
693 movw (%esi), %dx
694 movw %dx, (%eax)
695 mov STR3(%esp), %eax
696 RETURN
697
698 .p2align 4
699 # ifdef USE_AS_STRNCAT
700 L(StrncatExit3):
701 movb %bh, 3(%eax)
702 # endif
703 L(Exit3):
704 movw (%esi), %cx
705 movw %cx, (%eax)
706 # ifdef USE_AS_STRNCAT
707 movb 2(%esi), %dh
708 # endif
709 movb %dh, 2(%eax)
710 mov STR3(%esp), %eax
711 RETURN
712
713 .p2align 4
714 # ifdef USE_AS_STRNCAT
715 L(StrncatExit4):
716 movb %bh, 4(%eax)
717 # endif
718 L(Exit4):
719 movl (%esi), %edx
720 movl %edx, (%eax)
721 mov STR3(%esp), %eax
722 RETURN
723
724 .p2align 4
725 # ifdef USE_AS_STRNCAT
726 L(StrncatExit5):
727 movb %bh, 5(%eax)
728 # endif
729 L(Exit5):
730 movl (%esi), %ecx
731 # ifdef USE_AS_STRNCAT
732 movb 4(%esi), %dh
733 # endif
734 movb %dh, 4(%eax)
735 movl %ecx, (%eax)
736 mov STR3(%esp), %eax
737 RETURN
738
739 .p2align 4
740 # ifdef USE_AS_STRNCAT
741 L(StrncatExit6):
742 movb %bh, 6(%eax)
743 # endif
744 L(Exit6):
745 movl (%esi), %ecx
746 movw 4(%esi), %dx
747 movl %ecx, (%eax)
748 movw %dx, 4(%eax)
749 mov STR3(%esp), %eax
750 RETURN
751
752 .p2align 4
753 # ifdef USE_AS_STRNCAT
754 L(StrncatExit7):
755 movb %bh, 7(%eax)
756 # endif
757 L(Exit7):
/* Bytes 0-3 and 3-6: two overlapping 4-byte moves.  */
758 movl (%esi), %ecx
759 movl 3(%esi), %edx
760 movl %ecx, (%eax)
761 movl %edx, 3(%eax)
762 mov STR3(%esp), %eax
763 RETURN
764
765 .p2align 4
766 # ifdef USE_AS_STRNCAT
767 L(StrncatExit8):
768 movb %bh, 8(%eax)
769 # endif
770 L(Exit8):
771 movlpd (%esi), %xmm0
772 movlpd %xmm0, (%eax)
773 mov STR3(%esp), %eax
774 RETURN
775
/* Exit stubs for 9-16 bytes.  ExitN copies N bytes from (%esi) to
   (%eax), reloads the original destination pointer as the return value
   and returns.  In the strncat build StrncatExitN first stores a NUL
   (from %bh, zero because %ebx is a table index no larger than 32) at
   offset N and falls through into ExitN.  Where %dh is stored as the
   last byte, the strcat build relies on %dh being zero (%edx is a
   table index below 32); the strncat build loads the source byte
   first.  Lengths that are not a power of two are covered by an 8-byte
   move plus a second, possibly overlapping, move.  */
776 .p2align 4
777 # ifdef USE_AS_STRNCAT
778 L(StrncatExit9):
779 movb %bh, 9(%eax)
780 # endif
781 L(Exit9):
782 movlpd (%esi), %xmm0
783 # ifdef USE_AS_STRNCAT
784 movb 8(%esi), %dh
785 # endif
786 movb %dh, 8(%eax)
787 movlpd %xmm0, (%eax)
788 mov STR3(%esp), %eax
789 RETURN
790
791 .p2align 4
792 # ifdef USE_AS_STRNCAT
793 L(StrncatExit10):
794 movb %bh, 10(%eax)
795 # endif
796 L(Exit10):
797 movlpd (%esi), %xmm0
798 movw 8(%esi), %dx
799 movlpd %xmm0, (%eax)
800 movw %dx, 8(%eax)
801 mov STR3(%esp), %eax
802 RETURN
803
804 .p2align 4
805 # ifdef USE_AS_STRNCAT
806 L(StrncatExit11):
807 movb %bh, 11(%eax)
808 # endif
809 L(Exit11):
/* Bytes 0-7 and 7-10.  */
810 movlpd (%esi), %xmm0
811 movl 7(%esi), %edx
812 movlpd %xmm0, (%eax)
813 movl %edx, 7(%eax)
814 mov STR3(%esp), %eax
815 RETURN
816
817 .p2align 4
818 # ifdef USE_AS_STRNCAT
819 L(StrncatExit12):
820 movb %bh, 12(%eax)
821 # endif
822 L(Exit12):
823 movlpd (%esi), %xmm0
824 movl 8(%esi), %edx
825 movlpd %xmm0, (%eax)
826 movl %edx, 8(%eax)
827 mov STR3(%esp), %eax
828 RETURN
829
830 .p2align 4
831 # ifdef USE_AS_STRNCAT
832 L(StrncatExit13):
833 movb %bh, 13(%eax)
834 # endif
835 L(Exit13):
/* Bytes 0-7 and 5-12.  */
836 movlpd (%esi), %xmm0
837 movlpd 5(%esi), %xmm1
838 movlpd %xmm0, (%eax)
839 movlpd %xmm1, 5(%eax)
840 mov STR3(%esp), %eax
841 RETURN
842
843 .p2align 4
844 # ifdef USE_AS_STRNCAT
845 L(StrncatExit14):
846 movb %bh, 14(%eax)
847 # endif
848 L(Exit14):
/* Bytes 0-7 and 6-13.  */
849 movlpd (%esi), %xmm0
850 movlpd 6(%esi), %xmm1
851 movlpd %xmm0, (%eax)
852 movlpd %xmm1, 6(%eax)
853 mov STR3(%esp), %eax
854 RETURN
855
856 .p2align 4
857 # ifdef USE_AS_STRNCAT
858 L(StrncatExit15):
859 movb %bh, 15(%eax)
860 # endif
861 L(Exit15):
/* Bytes 0-7 and 7-14.  */
862 movlpd (%esi), %xmm0
863 movlpd 7(%esi), %xmm1
864 movlpd %xmm0, (%eax)
865 movlpd %xmm1, 7(%eax)
866 mov STR3(%esp), %eax
867 RETURN
868
869 .p2align 4
870 # ifdef USE_AS_STRNCAT
871 L(StrncatExit16):
872 movb %bh, 16(%eax)
873 # endif
874 L(Exit16):
875 movdqu (%esi), %xmm0
876 movdqu %xmm0, (%eax)
877 mov STR3(%esp), %eax
878 RETURN
879
/* Exit stubs for 17-24 bytes.  ExitN copies N bytes from (%esi) to
   (%eax) with one unaligned 16-byte move plus a smaller, possibly
   overlapping, move for the tail, then reloads the original
   destination pointer as the return value and returns.  In the strncat
   build StrncatExitN first stores a NUL (from %bh, zero because %ebx
   is a table index no larger than 32) at offset N and falls through
   into ExitN.  Where %dh is stored as the last byte, the strcat build
   relies on %dh being zero (%edx is a table index below 32); the
   strncat build loads the source byte first.  */
880 .p2align 4
881 # ifdef USE_AS_STRNCAT
882 L(StrncatExit17):
883 movb %bh, 17(%eax)
884 # endif
885 L(Exit17):
886 movdqu (%esi), %xmm0
887 # ifdef USE_AS_STRNCAT
888 movb 16(%esi), %dh
889 # endif
890 movdqu %xmm0, (%eax)
891 movb %dh, 16(%eax)
892 mov STR3(%esp), %eax
893 RETURN
894
895 .p2align 4
896 # ifdef USE_AS_STRNCAT
897 L(StrncatExit18):
898 movb %bh, 18(%eax)
899 # endif
900 L(Exit18):
901 movdqu (%esi), %xmm0
902 movw 16(%esi), %cx
903 movdqu %xmm0, (%eax)
904 movw %cx, 16(%eax)
905 mov STR3(%esp), %eax
906 RETURN
907
908 .p2align 4
909 # ifdef USE_AS_STRNCAT
910 L(StrncatExit19):
911 movb %bh, 19(%eax)
912 # endif
913 L(Exit19):
/* Bytes 0-15 and 15-18.  */
914 movdqu (%esi), %xmm0
915 movl 15(%esi), %ecx
916 movdqu %xmm0, (%eax)
917 movl %ecx, 15(%eax)
918 mov STR3(%esp), %eax
919 RETURN
920
921 .p2align 4
922 # ifdef USE_AS_STRNCAT
923 L(StrncatExit20):
924 movb %bh, 20(%eax)
925 # endif
926 L(Exit20):
927 movdqu (%esi), %xmm0
928 movl 16(%esi), %ecx
929 movdqu %xmm0, (%eax)
930 movl %ecx, 16(%eax)
931 mov STR3(%esp), %eax
932 RETURN
933
934 .p2align 4
935 # ifdef USE_AS_STRNCAT
936 L(StrncatExit21):
937 movb %bh, 21(%eax)
938 # endif
939 L(Exit21):
940 movdqu (%esi), %xmm0
941 movl 16(%esi), %ecx
942 # ifdef USE_AS_STRNCAT
943 movb 20(%esi), %dh
944 # endif
945 movdqu %xmm0, (%eax)
946 movl %ecx, 16(%eax)
947 movb %dh, 20(%eax)
948 mov STR3(%esp), %eax
949 RETURN
950
951 .p2align 4
952 # ifdef USE_AS_STRNCAT
953 L(StrncatExit22):
954 movb %bh, 22(%eax)
955 # endif
956 L(Exit22):
/* Bytes 0-15 and 14-21.  */
957 movdqu (%esi), %xmm0
958 movlpd 14(%esi), %xmm3
959 movdqu %xmm0, (%eax)
960 movlpd %xmm3, 14(%eax)
961 mov STR3(%esp), %eax
962 RETURN
963
964 .p2align 4
965 # ifdef USE_AS_STRNCAT
966 L(StrncatExit23):
967 movb %bh, 23(%eax)
968 # endif
969 L(Exit23):
/* Bytes 0-15 and 15-22.  */
970 movdqu (%esi), %xmm0
971 movlpd 15(%esi), %xmm3
972 movdqu %xmm0, (%eax)
973 movlpd %xmm3, 15(%eax)
974 mov STR3(%esp), %eax
975 RETURN
976
977 .p2align 4
978 # ifdef USE_AS_STRNCAT
979 L(StrncatExit24):
980 movb %bh, 24(%eax)
981 # endif
982 L(Exit24):
983 movdqu (%esi), %xmm0
984 movlpd 16(%esi), %xmm2
985 movdqu %xmm0, (%eax)
986 movlpd %xmm2, 16(%eax)
987 mov STR3(%esp), %eax
988 RETURN
989
/* Exit stubs for 25-32 bytes.  ExitN copies N bytes from (%esi) to
   (%eax) with one unaligned 16-byte move plus further, possibly
   overlapping, moves for the tail, then reloads the original
   destination pointer as the return value and returns.  In the strncat
   build StrncatExitN first stores a NUL (from %bh, zero because %ebx
   is a table index no larger than 32) at offset N and falls through
   into ExitN.  Where %dh is stored as the last byte, the strcat build
   relies on %dh being zero (%edx is a table index below 32); the
   strncat build loads the source byte first.  */
990 .p2align 4
991 # ifdef USE_AS_STRNCAT
992 L(StrncatExit25):
993 movb %bh, 25(%eax)
994 # endif
995 L(Exit25):
996 movdqu (%esi), %xmm0
997 movlpd 16(%esi), %xmm2
998 # ifdef USE_AS_STRNCAT
999 movb 24(%esi), %dh
1000 # endif
1001 movdqu %xmm0, (%eax)
1002 movlpd %xmm2, 16(%eax)
1003 movb %dh, 24(%eax)
1004 mov STR3(%esp), %eax
1005 RETURN
1006
1007 .p2align 4
1008 # ifdef USE_AS_STRNCAT
1009 L(StrncatExit26):
1010 movb %bh, 26(%eax)
1011 # endif
1012 L(Exit26):
1013 movdqu (%esi), %xmm0
1014 movlpd 16(%esi), %xmm2
1015 movw 24(%esi), %cx
1016 movdqu %xmm0, (%eax)
1017 movlpd %xmm2, 16(%eax)
1018 movw %cx, 24(%eax)
1019 mov STR3(%esp), %eax
1020 RETURN
1021
1022 .p2align 4
1023 # ifdef USE_AS_STRNCAT
1024 L(StrncatExit27):
1025 movb %bh, 27(%eax)
1026 # endif
1027 L(Exit27):
/* Bytes 0-15, 16-23 and 23-26.  */
1028 movdqu (%esi), %xmm0
1029 movlpd 16(%esi), %xmm2
1030 movl 23(%esi), %ecx
1031 movdqu %xmm0, (%eax)
1032 movlpd %xmm2, 16(%eax)
1033 movl %ecx, 23(%eax)
1034 mov STR3(%esp), %eax
1035 RETURN
1036
1037 .p2align 4
1038 # ifdef USE_AS_STRNCAT
1039 L(StrncatExit28):
1040 movb %bh, 28(%eax)
1041 # endif
1042 L(Exit28):
1043 movdqu (%esi), %xmm0
1044 movlpd 16(%esi), %xmm2
1045 movl 24(%esi), %ecx
1046 movdqu %xmm0, (%eax)
1047 movlpd %xmm2, 16(%eax)
1048 movl %ecx, 24(%eax)
1049 mov STR3(%esp), %eax
1050 RETURN
1051
1052 .p2align 4
1053 # ifdef USE_AS_STRNCAT
1054 L(StrncatExit29):
1055 movb %bh, 29(%eax)
1056 # endif
1057 L(Exit29):
/* Bytes 0-15 and 13-28: two overlapping 16-byte moves.  */
1058 movdqu (%esi), %xmm0
1059 movdqu 13(%esi), %xmm2
1060 movdqu %xmm0, (%eax)
1061 movdqu %xmm2, 13(%eax)
1062 mov STR3(%esp), %eax
1063 RETURN
1064
1065 .p2align 4
1066 # ifdef USE_AS_STRNCAT
1067 L(StrncatExit30):
1068 movb %bh, 30(%eax)
1069 # endif
1070 L(Exit30):
/* Bytes 0-15 and 14-29.  */
1071 movdqu (%esi), %xmm0
1072 movdqu 14(%esi), %xmm2
1073 movdqu %xmm0, (%eax)
1074 movdqu %xmm2, 14(%eax)
1075 mov STR3(%esp), %eax
1076 RETURN
1077
1078 .p2align 4
1079 # ifdef USE_AS_STRNCAT
1080 L(StrncatExit31):
1081 movb %bh, 31(%eax)
1082 # endif
1083 L(Exit31):
/* Bytes 0-15 and 15-30.  */
1084 movdqu (%esi), %xmm0
1085 movdqu 15(%esi), %xmm2
1086 movdqu %xmm0, (%eax)
1087 movdqu %xmm2, 15(%eax)
1088 mov STR3(%esp), %eax
1089 RETURN
1090
1091 .p2align 4
1092 # ifdef USE_AS_STRNCAT
1093 L(StrncatExit32):
1094 movb %bh, 32(%eax)
1095 # endif
1096 L(Exit32):
1097 movdqu (%esi), %xmm0
1098 movdqu 16(%esi), %xmm2
1099 movdqu %xmm0, (%eax)
1100 movdqu %xmm2, 16(%eax)
1101 mov STR3(%esp), %eax
1102 RETURN
1103
1104 # ifdef USE_AS_STRNCAT
1105
/* strncat: the length limit ran out while the 64-byte loop was holding
   a group in %xmm4-%xmm7.  %edx is the NUL mask of the whole group and
   %ebx the (zero or negative) remainder after 64 was subtracted.  */
1106 .p2align 4
1107 L(UnalignedLeaveCase2OrCase3):
1108 test %edx, %edx
1109 jnz L(Unaligned64LeaveCase2)
/* Case3: no NUL in the group.  Store the whole chunks that are still
   inside the limit, then let CopyFrom1To16BytesCase3 copy the partial
   chunk.  %ecx is precomputed as the offset of that partial chunk
   within the group (the remaining count rounded down to 16).  */
1110 L(Unaligned64LeaveCase3):
1111 lea 64(%ebx), %ecx
1112 and $-16, %ecx
1113 add $48, %ebx
1114 jl L(CopyFrom1To16BytesCase3)
1115 movdqu %xmm4, (%eax)
1116 sub $16, %ebx
1117 jb L(CopyFrom1To16BytesCase3)
1118 movdqu %xmm5, 16(%eax)
1119 sub $16, %ebx
1120 jb L(CopyFrom1To16BytesCase3)
1121 movdqu %xmm6, 32(%eax)
1122 sub $16, %ebx
1123 jb L(CopyFrom1To16BytesCase3)
/* The limit covers the complete group: store the last chunk and
   terminate the result right behind it.  */
1124 movdqu %xmm7, 48(%eax)
1125 xor %bh, %bh
1126 movb %bh, 64(%eax)
1127 mov STR3(%esp), %eax
1128 RETURN
1129
/* Case2: the group contains a NUL and the limit also ends inside it.
   Walk the four chunks in order with %ecx as the chunk offset; at each
   chunk either the limit ends there (Case2OrCase3 decides), or the NUL
   is there (Case1), or the chunk is stored and the walk continues.
   %xmm0 is all-zero on entry and stays so while no NUL is matched.  */
1130 .p2align 4
1131 L(Unaligned64LeaveCase2):
1132 xor %ecx, %ecx
1133 pcmpeqb %xmm4, %xmm0
1134 pmovmskb %xmm0, %edx
1135 add $48, %ebx
1136 jle L(CopyFrom1To16BytesCase2OrCase3)
1137 test %edx, %edx
1138 jnz L(CopyFrom1To16Bytes)
1139
1140 pcmpeqb %xmm5, %xmm0
1141 pmovmskb %xmm0, %edx
1142 movdqu %xmm4, (%eax)
1143 add $16, %ecx
1144 sub $16, %ebx
1145 jbe L(CopyFrom1To16BytesCase2OrCase3)
1146 test %edx, %edx
1147 jnz L(CopyFrom1To16Bytes)
1148
1149 pcmpeqb %xmm6, %xmm0
1150 pmovmskb %xmm0, %edx
1151 movdqu %xmm5, 16(%eax)
1152 add $16, %ecx
1153 sub $16, %ebx
1154 jbe L(CopyFrom1To16BytesCase2OrCase3)
1155 test %edx, %edx
1156 jnz L(CopyFrom1To16Bytes)
1157
/* Both the NUL and the end of the limit are in the fourth chunk:
   store the third, point the cursors at the fourth and choose between
   "copy through the NUL" and "copy %ebx bytes and append a NUL".  */
1158 pcmpeqb %xmm7, %xmm0
1159 pmovmskb %xmm0, %edx
1160 movdqu %xmm6, 32(%eax)
1161 lea 16(%eax, %ecx), %eax
1162 lea 16(%esi, %ecx), %esi
1163 bsf %edx, %edx
1164 cmp %ebx, %edx
1165 jb L(CopyFrom1To16BytesExit)
1166 BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
1167 # endif
/* Nothing to append (empty source or, for strncat, a zero limit).
   %eax still holds the destination pointer loaded at entry, which is
   the return value, so only the saved registers need restoring.  */
1168 .p2align 4
1169 L(ExitZero):
1170 RETURN
1171
1172 END (STRCAT)
1173
/* Jump tables used by BRANCH_TO_JMPTBL_ENTRY with a scale of 4.
   ExitTable is indexed by the position of the source's NUL within the
   bytes still to be copied: entry i is L(Exit<i+1>), which copies
   i + 1 bytes.  ExitStrncatTable (strncat build only) is indexed by
   the number of bytes still allowed: entry k is L(StrncatExit<k>),
   which copies k bytes and appends a NUL.
   NOTE(review): the .p2align directive is issued before the switch to
   .rodata, so it pads the preceding section rather than aligning the
   tables - confirm whether that is intended.  */
1174 .p2align 4
1175 .section .rodata
1176 L(ExitTable):
1177 .int JMPTBL(L(Exit1), L(ExitTable))
1178 .int JMPTBL(L(Exit2), L(ExitTable))
1179 .int JMPTBL(L(Exit3), L(ExitTable))
1180 .int JMPTBL(L(Exit4), L(ExitTable))
1181 .int JMPTBL(L(Exit5), L(ExitTable))
1182 .int JMPTBL(L(Exit6), L(ExitTable))
1183 .int JMPTBL(L(Exit7), L(ExitTable))
1184 .int JMPTBL(L(Exit8), L(ExitTable))
1185 .int JMPTBL(L(Exit9), L(ExitTable))
1186 .int JMPTBL(L(Exit10), L(ExitTable))
1187 .int JMPTBL(L(Exit11), L(ExitTable))
1188 .int JMPTBL(L(Exit12), L(ExitTable))
1189 .int JMPTBL(L(Exit13), L(ExitTable))
1190 .int JMPTBL(L(Exit14), L(ExitTable))
1191 .int JMPTBL(L(Exit15), L(ExitTable))
1192 .int JMPTBL(L(Exit16), L(ExitTable))
1193 .int JMPTBL(L(Exit17), L(ExitTable))
1194 .int JMPTBL(L(Exit18), L(ExitTable))
1195 .int JMPTBL(L(Exit19), L(ExitTable))
1196 .int JMPTBL(L(Exit20), L(ExitTable))
1197 .int JMPTBL(L(Exit21), L(ExitTable))
1198 .int JMPTBL(L(Exit22), L(ExitTable))
1199 .int JMPTBL(L(Exit23), L(ExitTable))
1200 .int JMPTBL(L(Exit24), L(ExitTable))
1201 .int JMPTBL(L(Exit25), L(ExitTable))
1202 .int JMPTBL(L(Exit26), L(ExitTable))
1203 .int JMPTBL(L(Exit27), L(ExitTable))
1204 .int JMPTBL(L(Exit28), L(ExitTable))
1205 .int JMPTBL(L(Exit29), L(ExitTable))
1206 .int JMPTBL(L(Exit30), L(ExitTable))
1207 .int JMPTBL(L(Exit31), L(ExitTable))
1208 .int JMPTBL(L(Exit32), L(ExitTable))
1209 # ifdef USE_AS_STRNCAT
1210 L(ExitStrncatTable):
1211 .int JMPTBL(L(StrncatExit0), L(ExitStrncatTable))
1212 .int JMPTBL(L(StrncatExit1), L(ExitStrncatTable))
1213 .int JMPTBL(L(StrncatExit2), L(ExitStrncatTable))
1214 .int JMPTBL(L(StrncatExit3), L(ExitStrncatTable))
1215 .int JMPTBL(L(StrncatExit4), L(ExitStrncatTable))
1216 .int JMPTBL(L(StrncatExit5), L(ExitStrncatTable))
1217 .int JMPTBL(L(StrncatExit6), L(ExitStrncatTable))
1218 .int JMPTBL(L(StrncatExit7), L(ExitStrncatTable))
1219 .int JMPTBL(L(StrncatExit8), L(ExitStrncatTable))
1220 .int JMPTBL(L(StrncatExit9), L(ExitStrncatTable))
1221 .int JMPTBL(L(StrncatExit10), L(ExitStrncatTable))
1222 .int JMPTBL(L(StrncatExit11), L(ExitStrncatTable))
1223 .int JMPTBL(L(StrncatExit12), L(ExitStrncatTable))
1224 .int JMPTBL(L(StrncatExit13), L(ExitStrncatTable))
1225 .int JMPTBL(L(StrncatExit14), L(ExitStrncatTable))
1226 .int JMPTBL(L(StrncatExit15), L(ExitStrncatTable))
1227 .int JMPTBL(L(StrncatExit16), L(ExitStrncatTable))
1228 .int JMPTBL(L(StrncatExit17), L(ExitStrncatTable))
1229 .int JMPTBL(L(StrncatExit18), L(ExitStrncatTable))
1230 .int JMPTBL(L(StrncatExit19), L(ExitStrncatTable))
1231 .int JMPTBL(L(StrncatExit20), L(ExitStrncatTable))
1232 .int JMPTBL(L(StrncatExit21), L(ExitStrncatTable))
1233 .int JMPTBL(L(StrncatExit22), L(ExitStrncatTable))
1234 .int JMPTBL(L(StrncatExit23), L(ExitStrncatTable))
1235 .int JMPTBL(L(StrncatExit24), L(ExitStrncatTable))
1236 .int JMPTBL(L(StrncatExit25), L(ExitStrncatTable))
1237 .int JMPTBL(L(StrncatExit26), L(ExitStrncatTable))
1238 .int JMPTBL(L(StrncatExit27), L(ExitStrncatTable))
1239 .int JMPTBL(L(StrncatExit28), L(ExitStrncatTable))
1240 .int JMPTBL(L(StrncatExit29), L(ExitStrncatTable))
1241 .int JMPTBL(L(StrncatExit30), L(ExitStrncatTable))
1242 .int JMPTBL(L(StrncatExit31), L(ExitStrncatTable))
1243 .int JMPTBL(L(StrncatExit32), L(ExitStrncatTable))
1244 # endif
1245 #endif