]> git.ipfire.org Git - thirdparty/glibc.git/blame - sysdeps/i386/i686/multiarch/wcscpy-ssse3.S
Update copyright dates with scripts/update-copyrights.
[thirdparty/glibc.git] / sysdeps / i386 / i686 / multiarch / wcscpy-ssse3.S
CommitLineData
/* wcscpy with SSSE3
   Copyright (C) 2011-2019 Free Software Foundation, Inc.
   Contributed by Intel Corporation.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#if IS_IN (libc)
# include <sysdep.h>

/* Unwind annotations that accompany a 4-byte push / pop of REG.  */
# define CFI_PUSH(REG)	\
	cfi_adjust_cfa_offset (4);	\
	cfi_rel_offset (REG, 0)

# define CFI_POP(REG)	\
	cfi_adjust_cfa_offset (-4);	\
	cfi_restore (REG)

# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
# define POP(REG)	popl REG; CFI_POP (REG)

/* Stack offsets of the arguments on entry (return address at 0).
   RETURN restores %edi and returns; the trailing CFI_PUSH re-establishes
   the "%edi is pushed" unwind state for the code that textually follows
   the ret.  LEN is defined but not referenced by the code below.  */
# define PARMS	4
# define RETURN	POP (%edi); ret; CFI_PUSH (%edi)
# define STR1	PARMS
# define STR2	STR1+4
# define LEN	STR2+4

/* __wcscpy_ssse3: copy the zero-terminated string of 4-byte elements at
   STR2(%esp) (source) to STR1(%esp) (destination), terminator included.

   Out:      %eax = the destination pointer as passed in.
   Saved:    %edi, %esi (pushed and popped on the long path only).
   Clobbers: %eax, %ecx, %edx, %xmm0-%xmm7, flags.

   Register roles on the long path:
     %ecx  source cursor          %edx  destination cursor
     %edi  original destination   %esi  byte offset / scratch
     %xmm0 all-zero comparand.  Every "pcmpeqd X, %xmm0" that finds no
	   zero element leaves %xmm0 all-zero again, so it never needs to
	   be re-cleared on the fall-through paths.
     %eax  pmovmskb mask: 4 mask bits per 4-byte element, so bit 0 of
	   %al / %ah tells whether element 0 / element 2 is zero.

   NOTE(review): the Shl4/Shl8/Shl12 dispatch assumes both pointers are
   4-byte aligned, so that the source is off by 0, 4, 8 or 12 bytes once
   the destination has been 16-byte aligned -- confirm with callers.  */

	atom_text_section
ENTRY (__wcscpy_ssse3)
	mov	STR1(%esp), %edx
	mov	STR2(%esp), %ecx

	/* Terminator within the first four elements: copy 4/8/12/16 bytes
	   without touching any callee-saved register.  The explicit "l"
	   suffix pins the 32-bit operand size; there is no register
	   operand from which the assembler could infer it.  */
	cmpl	$0, (%ecx)
	jz	L(ExitTail4)
	cmpl	$0, 4(%ecx)
	jz	L(ExitTail8)
	cmpl	$0, 8(%ecx)
	jz	L(ExitTail12)
	cmpl	$0, 12(%ecx)
	jz	L(ExitTail16)

	PUSH	(%edi)
	mov	%edx, %edi		/* Keep the return value.  */
	PUSH	(%esi)
	lea	16(%ecx), %esi

	and	$-16, %esi		/* First 16-aligned address above src.  */

	/* Check that aligned block for a zero element while copying the
	   first 16 source bytes, which were just shown to be zero-free.  */
	pxor	%xmm0, %xmm0
	pcmpeqd	(%esi), %xmm0
	movdqu	(%ecx), %xmm1
	movdqu	%xmm1, (%edx)

	pmovmskb %xmm0, %eax
	sub	%ecx, %esi		/* Offset of the aligned block from src.  */

	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	/* Advance dest to the next 16-byte boundary and src by the same
	   number of bytes.  */
	mov	%edx, %eax
	lea	16(%edx), %edx
	and	$-16, %edx
	sub	%edx, %eax

	sub	%eax, %ecx
	mov	%ecx, %eax
	and	$0xf, %eax
	/* Must stay a mov: the jz below consumes ZF from the and above,
	   and xor would overwrite the flags.  */
	mov	$0, %esi

	jz	L(Align16Both)
	cmp	$4, %eax
	je	L(Shl4)
	cmp	$8, %eax
	je	L(Shl8)
	jmp	L(Shl12)

	/* Source and destination are both 16-byte aligned.  Each step loads
	   the next block, stores the previous (already checked) one and
	   tests the new one; %esi is the running byte offset.  */
L(Align16Both):
	movaps	(%ecx), %xmm1
	movaps	16(%ecx), %xmm2
	movaps	%xmm1, (%edx)
	pcmpeqd	%xmm2, %xmm0
	pmovmskb %xmm0, %eax
	lea	16(%esi), %esi

	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	movaps	16(%ecx, %esi), %xmm3
	movaps	%xmm2, (%edx, %esi)
	pcmpeqd	%xmm3, %xmm0
	pmovmskb %xmm0, %eax
	lea	16(%esi), %esi

	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	movaps	16(%ecx, %esi), %xmm4
	movaps	%xmm3, (%edx, %esi)
	pcmpeqd	%xmm4, %xmm0
	pmovmskb %xmm0, %eax
	lea	16(%esi), %esi

	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	movaps	16(%ecx, %esi), %xmm1
	movaps	%xmm4, (%edx, %esi)
	pcmpeqd	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	lea	16(%esi), %esi

	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	movaps	16(%ecx, %esi), %xmm2
	movaps	%xmm1, (%edx, %esi)
	pcmpeqd	%xmm2, %xmm0
	pmovmskb %xmm0, %eax
	lea	16(%esi), %esi

	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	movaps	16(%ecx, %esi), %xmm3
	movaps	%xmm2, (%edx, %esi)
	pcmpeqd	%xmm3, %xmm0
	pmovmskb %xmm0, %eax
	lea	16(%esi), %esi

	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	/* Round the source cursor down to a 64-byte boundary and move the
	   destination cursor back by the same distance before entering
	   the 64-byte loop.  */
	movaps	%xmm3, (%edx, %esi)
	mov	%ecx, %eax
	lea	16(%ecx, %esi), %ecx
	and	$-0x40, %ecx
	sub	%ecx, %eax
	sub	%eax, %edx

	mov	$-0x40, %esi

	/* 64 bytes per iteration.  The four blocks are folded with a
	   byte-wise minimum and tested once.  That fold can yield a zero
	   element even when no single block holds one, so
	   L(Aligned64Leave) re-tests each block and resumes the loop when
	   none of them terminates the string.  */
L(Aligned64Loop):
	movaps	(%ecx), %xmm2
	movaps	32(%ecx), %xmm3
	movaps	%xmm2, %xmm4
	movaps	16(%ecx), %xmm5
	movaps	%xmm3, %xmm6
	movaps	48(%ecx), %xmm7
	pminub	%xmm5, %xmm2
	pminub	%xmm7, %xmm3
	pminub	%xmm2, %xmm3
	lea	64(%edx), %edx
	pcmpeqd	%xmm0, %xmm3
	lea	64(%ecx), %ecx
	pmovmskb %xmm3, %eax

	test	%eax, %eax
	jnz	L(Aligned64Leave)
	movaps	%xmm4, -64(%edx)
	movaps	%xmm5, -48(%edx)
	movaps	%xmm6, -32(%edx)
	movaps	%xmm7, -16(%edx)
	jmp	L(Aligned64Loop)

	/* Both cursors are already 64 bytes past the group, hence the
	   negative store offsets; %esi starts at -64 and steps by 16 so
	   that L(CopyFrom1To16Bytes) lands on the terminating block.  */
L(Aligned64Leave):
	pcmpeqd	%xmm4, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	pcmpeqd	%xmm5, %xmm0
	pmovmskb %xmm0, %eax
	movaps	%xmm4, -64(%edx)
	test	%eax, %eax
	lea	16(%esi), %esi
	jnz	L(CopyFrom1To16Bytes)

	pcmpeqd	%xmm6, %xmm0
	pmovmskb %xmm0, %eax
	movaps	%xmm5, -48(%edx)
	test	%eax, %eax
	lea	16(%esi), %esi
	jnz	L(CopyFrom1To16Bytes)

	movaps	%xmm6, -32(%edx)
	pcmpeqd	%xmm7, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	lea	16(%esi), %esi
	jnz	L(CopyFrom1To16Bytes)

	mov	$-0x40, %esi
	movaps	%xmm7, -16(%edx)
	jmp	L(Aligned64Loop)

	/* Source is 4 bytes past a 16-byte boundary, destination aligned.
	   Aligned loads at -4(%ecx) and 12(%ecx) are merged with palignr
	   into the 16 bytes starting at (%ecx).  A block is only stored
	   after the following source block has been tested.  */
	.p2align 4
L(Shl4):
	movaps	-4(%ecx), %xmm1
	movaps	12(%ecx), %xmm2
L(Shl4Start):
	pcmpeqd	%xmm2, %xmm0
	pmovmskb %xmm0, %eax
	movaps	%xmm2, %xmm3

	test	%eax, %eax
	jnz	L(Shl4LoopExit)

	palignr	$4, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	28(%ecx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%edx), %edx
	pmovmskb %xmm0, %eax
	lea	16(%ecx), %ecx
	movaps	%xmm2, %xmm1

	test	%eax, %eax
	jnz	L(Shl4LoopExit)

	palignr	$4, %xmm3, %xmm2
	movaps	%xmm2, (%edx)
	movaps	28(%ecx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%edx), %edx
	pmovmskb %xmm0, %eax
	lea	16(%ecx), %ecx
	movaps	%xmm2, %xmm3

	test	%eax, %eax
	jnz	L(Shl4LoopExit)

	palignr	$4, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	28(%ecx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%edx), %edx
	pmovmskb %xmm0, %eax
	lea	16(%ecx), %ecx

	test	%eax, %eax
	jnz	L(Shl4LoopExit)

	palignr	$4, %xmm3, %xmm2
	movaps	%xmm2, (%edx)
	lea	28(%ecx), %ecx
	lea	16(%edx), %edx

	/* Round the aligned source address down to a 64-byte boundary,
	   re-apply the 4-byte skew (-12) and move the destination cursor
	   back by the same distance.  */
	mov	%ecx, %eax
	and	$-0x40, %ecx
	sub	%ecx, %eax
	lea	-12(%ecx), %ecx
	sub	%eax, %edx

	movaps	-4(%ecx), %xmm1

	/* 64 bytes per iteration.  On a possible terminator the cursors
	   have not been advanced and %xmm1/%xmm2 are intact, so control
	   re-enters the block-at-a-time path at L(Shl4Start).  */
L(Shl4LoopStart):
	movaps	12(%ecx), %xmm2
	movaps	28(%ecx), %xmm3
	movaps	%xmm3, %xmm6
	movaps	44(%ecx), %xmm4
	movaps	%xmm4, %xmm7
	movaps	60(%ecx), %xmm5
	pminub	%xmm2, %xmm6
	pminub	%xmm5, %xmm7
	pminub	%xmm6, %xmm7
	pcmpeqd	%xmm0, %xmm7
	pmovmskb %xmm7, %eax
	movaps	%xmm5, %xmm7
	palignr	$4, %xmm4, %xmm5
	test	%eax, %eax
	palignr	$4, %xmm3, %xmm4
	jnz	L(Shl4Start)

	palignr	$4, %xmm2, %xmm3
	lea	64(%ecx), %ecx
	palignr	$4, %xmm1, %xmm2
	movaps	%xmm7, %xmm1
	movaps	%xmm5, 48(%edx)
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	jmp	L(Shl4LoopStart)

	/* %eax is the mask of the aligned block at 12(%ecx).  Copy the 12
	   bytes in front of it, step both cursors onto it, then copy up
	   to and including the terminator.  */
L(Shl4LoopExit):
	movlpd	(%ecx), %xmm0
	movl	8(%ecx), %esi
	movlpd	%xmm0, (%edx)
	movl	%esi, 8(%edx)
	POP	(%esi)
	add	$12, %edx
	add	$12, %ecx
	test	%al, %al
	jz	L(ExitHigh)
	test	$0x01, %al
	jnz	L(Exit4)
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movl	%edi, %eax
	RETURN

	/* The code below is entered with %esi still pushed.  */
	CFI_PUSH	(%esi)

	/* Same scheme as L(Shl4), for a source 8 bytes past a 16-byte
	   boundary.  */
	.p2align 4
L(Shl8):
	movaps	-8(%ecx), %xmm1
	movaps	8(%ecx), %xmm2
L(Shl8Start):
	pcmpeqd	%xmm2, %xmm0
	pmovmskb %xmm0, %eax
	movaps	%xmm2, %xmm3

	test	%eax, %eax
	jnz	L(Shl8LoopExit)

	palignr	$8, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	24(%ecx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%edx), %edx
	pmovmskb %xmm0, %eax
	lea	16(%ecx), %ecx
	movaps	%xmm2, %xmm1

	test	%eax, %eax
	jnz	L(Shl8LoopExit)

	palignr	$8, %xmm3, %xmm2
	movaps	%xmm2, (%edx)
	movaps	24(%ecx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%edx), %edx
	pmovmskb %xmm0, %eax
	lea	16(%ecx), %ecx
	movaps	%xmm2, %xmm3

	test	%eax, %eax
	jnz	L(Shl8LoopExit)

	palignr	$8, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	24(%ecx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%edx), %edx
	pmovmskb %xmm0, %eax
	lea	16(%ecx), %ecx

	test	%eax, %eax
	jnz	L(Shl8LoopExit)

	palignr	$8, %xmm3, %xmm2
	movaps	%xmm2, (%edx)
	lea	24(%ecx), %ecx
	lea	16(%edx), %edx

	/* 64-byte align the source, re-apply the 8-byte skew, and move
	   the destination cursor back by the same distance.  */
	mov	%ecx, %eax
	and	$-0x40, %ecx
	sub	%ecx, %eax
	lea	-8(%ecx), %ecx
	sub	%eax, %edx

	movaps	-8(%ecx), %xmm1

L(Shl8LoopStart):
	movaps	8(%ecx), %xmm2
	movaps	24(%ecx), %xmm3
	movaps	%xmm3, %xmm6
	movaps	40(%ecx), %xmm4
	movaps	%xmm4, %xmm7
	movaps	56(%ecx), %xmm5
	pminub	%xmm2, %xmm6
	pminub	%xmm5, %xmm7
	pminub	%xmm6, %xmm7
	pcmpeqd	%xmm0, %xmm7
	pmovmskb %xmm7, %eax
	movaps	%xmm5, %xmm7
	palignr	$8, %xmm4, %xmm5
	test	%eax, %eax
	palignr	$8, %xmm3, %xmm4
	jnz	L(Shl8Start)

	palignr	$8, %xmm2, %xmm3
	lea	64(%ecx), %ecx
	palignr	$8, %xmm1, %xmm2
	movaps	%xmm7, %xmm1
	movaps	%xmm5, 48(%edx)
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	jmp	L(Shl8LoopStart)

	/* Copy the 8 bytes in front of the terminating block at 8(%ecx),
	   step onto it and finish.  */
L(Shl8LoopExit):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	POP	(%esi)
	add	$8, %edx
	add	$8, %ecx
	test	%al, %al
	jz	L(ExitHigh)
	test	$0x01, %al
	jnz	L(Exit4)
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movl	%edi, %eax
	RETURN

	/* The code below is entered with %esi still pushed.  */
	CFI_PUSH	(%esi)

	/* Same scheme as L(Shl4), for a source 12 bytes past a 16-byte
	   boundary.  */
	.p2align 4
L(Shl12):
	movaps	-12(%ecx), %xmm1
	movaps	4(%ecx), %xmm2
L(Shl12Start):
	pcmpeqd	%xmm2, %xmm0
	pmovmskb %xmm0, %eax
	movaps	%xmm2, %xmm3

	test	%eax, %eax
	jnz	L(Shl12LoopExit)

	palignr	$12, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	20(%ecx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%edx), %edx
	pmovmskb %xmm0, %eax
	lea	16(%ecx), %ecx
	movaps	%xmm2, %xmm1

	test	%eax, %eax
	jnz	L(Shl12LoopExit)

	palignr	$12, %xmm3, %xmm2
	movaps	%xmm2, (%edx)
	movaps	20(%ecx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%edx), %edx
	pmovmskb %xmm0, %eax
	lea	16(%ecx), %ecx
	movaps	%xmm2, %xmm3

	test	%eax, %eax
	jnz	L(Shl12LoopExit)

	palignr	$12, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	20(%ecx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%edx), %edx
	pmovmskb %xmm0, %eax
	lea	16(%ecx), %ecx

	test	%eax, %eax
	jnz	L(Shl12LoopExit)

	palignr	$12, %xmm3, %xmm2
	movaps	%xmm2, (%edx)
	lea	20(%ecx), %ecx
	lea	16(%edx), %edx

	/* 64-byte align the source, re-apply the 12-byte skew, and move
	   the destination cursor back by the same distance.  */
	mov	%ecx, %eax
	and	$-0x40, %ecx
	sub	%ecx, %eax
	lea	-4(%ecx), %ecx
	sub	%eax, %edx

	movaps	-12(%ecx), %xmm1

L(Shl12LoopStart):
	movaps	4(%ecx), %xmm2
	movaps	20(%ecx), %xmm3
	movaps	%xmm3, %xmm6
	movaps	36(%ecx), %xmm4
	movaps	%xmm4, %xmm7
	movaps	52(%ecx), %xmm5
	pminub	%xmm2, %xmm6
	pminub	%xmm5, %xmm7
	pminub	%xmm6, %xmm7
	pcmpeqd	%xmm0, %xmm7
	pmovmskb %xmm7, %eax
	movaps	%xmm5, %xmm7
	palignr	$12, %xmm4, %xmm5
	test	%eax, %eax
	palignr	$12, %xmm3, %xmm4
	jnz	L(Shl12Start)

	palignr	$12, %xmm2, %xmm3
	lea	64(%ecx), %ecx
	palignr	$12, %xmm1, %xmm2
	movaps	%xmm7, %xmm1
	movaps	%xmm5, 48(%edx)
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	jmp	L(Shl12LoopStart)

	/* Copy the 4 bytes in front of the terminating block at 4(%ecx),
	   then fall through with %esi = 4 so that both cursors are stepped
	   onto that block.  */
L(Shl12LoopExit):
	movl	(%ecx), %esi
	movl	%esi, (%edx)
	mov	$4, %esi

	/* Common tail.  In: %esi = byte offset of the terminating 16-byte
	   block from both cursors, %eax = its mask.  Copies 4, 8, 12 or 16
	   bytes depending on which element is zero.  */
	.p2align 4
L(CopyFrom1To16Bytes):
	add	%esi, %edx
	add	%esi, %ecx

	POP	(%esi)
	test	%al, %al
	jz	L(ExitHigh)		/* Elements 0 and 1 are non-zero.  */
	test	$0x01, %al
	jnz	L(Exit4)
L(Exit8):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movl	%edi, %eax
	RETURN

	.p2align 4
L(ExitHigh):
	test	$0x01, %ah
	jnz	L(Exit12)
L(Exit16):
	movdqu	(%ecx), %xmm0
	movdqu	%xmm0, (%edx)
	movl	%edi, %eax
	RETURN

	.p2align 4
L(Exit4):
	movl	(%ecx), %eax
	movl	%eax, (%edx)
	movl	%edi, %eax
	RETURN

	.p2align 4
L(Exit12):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movl	8(%ecx), %eax
	movl	%eax, 8(%edx)
	movl	%edi, %eax
	RETURN

	/* The ExitTail paths are reached before anything was pushed.  */
	CFI_POP	(%edi)

	.p2align 4
L(ExitTail4):
	movl	(%ecx), %eax
	movl	%eax, (%edx)
	movl	%edx, %eax
	ret

	.p2align 4
L(ExitTail8):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movl	%edx, %eax
	ret

	.p2align 4
L(ExitTail12):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movl	8(%ecx), %eax
	movl	%eax, 8(%edx)
	movl	%edx, %eax
	ret

	.p2align 4
L(ExitTail16):
	movdqu	(%ecx), %xmm0
	movdqu	%xmm0, (%edx)
	movl	%edx, %eax
	ret

END (__wcscpy_ssse3)
#endif