2 Copyright (C) 2010 Free Software Foundation, Inc.
3 Contributed by Intel Corporation.
4 This file is part of the GNU C Library.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
/* Build glue for the SSSE3 memcmp variant (ia32, AT&T syntax).
   NOTE(review): each line below begins with its original-file line number —
   a paste artifact — and the excerpt is non-contiguous (intervening original
   lines are missing from this view).  */
24 #include "asm-syntax.h"
/* Under the multiarch scheme this implementation is exported as
   __memcmp_ssse3; the ifunc resolver (not visible here) selects it.  */
27 # define MEMCMP __memcmp_ssse3
/* CFI_PUSH(REG): unwind bookkeeping paired with a 4-byte pushl of REG.  */
30 #define CFI_PUSH(REG) \
31 cfi_adjust_cfa_offset (4); \
32 cfi_rel_offset (REG, 0)
/* CFI_POP(REG): unwind bookkeeping paired with popl of REG.
   NOTE(review): the continuation line (presumably cfi_restore (REG)) is
   missing from this excerpt — the trailing backslash shows the macro is
   truncated here; confirm against the full file.  */
34 #define CFI_POP(REG) \
35 cfi_adjust_cfa_offset (-4); \
/* pushl/popl wrappers that keep the CFI state in sync automatically.  */
38 #define PUSH(REG) pushl REG; CFI_PUSH (REG)
39 #define POP(REG) popl REG; CFI_POP (REG)
/* Shared epilogue: restore the three callee-saved registers this routine
   uses (%edi, %esi, %ebx) and return.  RETURN additionally re-arms the
   remembered CFI state so subsequent exit paths stay correct.  */
45 #define RETURN_END POP (%edi); POP (%esi); POP (%ebx); ret
46 #define RETURN RETURN_END; cfi_restore_state; cfi_remember_state
/* Place the code in a dedicated subsection so SSSE3 code can be grouped.  */
48 .section .text.ssse3,"ax",@progbits
/* Dispatch / shr_0 head fragments.  NOTE(review): non-contiguous excerpt;
   the jump-table setup and mask tests between these lines are missing.  */
108 jae L(next_unaligned_table)
126 L(next_unaligned_table):
/* Aligned-source fast path: compare 16-byte chunks of (%esi) vs (%edi);
   pcmpeqb sets 0xFF per matching byte, so a later pmovmskb (not visible
   here) of all-ones means the chunks are equal.  */
150 pcmpeqb (%edi), %xmm1
151 movaps 16(%esi), %xmm2
152 pcmpeqb 16(%edi), %xmm2
/* Advance both cursors by the processed count in %ecx; scale 1.  */
160 lea (%ecx, %edi,1), %eax
161 lea (%ecx, %esi,1), %edx
/* shr_0 "gobble" loop: both buffers 16-byte aligned relative to each other,
   so chunks are compared directly with no palignr re-alignment.  Four
   16-byte compares (offsets 0/16/32/48) per iteration.
   NOTE(review): non-contiguous excerpt — the pand/pmovmskb/sub counter
   updates between visible lines are missing from this view.  */
173 pcmpeqb (%edi), %xmm0
175 movdqa 16(%esi), %xmm2
176 pcmpeqb 16(%edi), %xmm2
177 L(shr_0_gobble_loop):
182 movdqa 32(%esi), %xmm0
183 movdqa 48(%esi), %xmm2
185 pcmpeqb 32(%edi), %xmm0
186 pcmpeqb 48(%edi), %xmm2
/* Loop while the (elided) combined mask test found no mismatch.  */
189 jz L(shr_0_gobble_loop)
193 jge L(shr_0_gobble_loop_next)
196 L(shr_0_gobble_loop_next):
/* Recompute end pointers: processed-byte count %ecx added to each base.  */
206 lea (%ecx, %edi,1), %eax
207 lea (%ecx, %esi,1), %edx
/* shr_1 path: %esi is 1 byte past 16-byte alignment.  Aligned movdqa loads
   are stitched back into the logical byte stream with palignr $1, then
   pcmpeqb'd against (%edi).  NOTE(review): non-contiguous excerpt; mask
   tests/branches between visible lines are missing.  */
221 movdqa 16(%esi), %xmm1
223 palignr $1,(%esi), %xmm1
224 pcmpeqb (%edi), %xmm1
226 movdqa 32(%esi), %xmm3
227 palignr $1,%xmm2, %xmm3
228 pcmpeqb 16(%edi), %xmm3
/* End-pointer fixup: %esi gets the extra +1 to undo the alignment bias.  */
236 lea (%ecx, %edi,1), %eax
237 lea 1(%ecx, %esi,1), %edx
/* shr_1 gobble (unrolled) variant.  */
247 movdqa 16(%esi), %xmm0
248 palignr $1,(%esi), %xmm0
249 pcmpeqb (%edi), %xmm0
251 movdqa 32(%esi), %xmm3
252 palignr $1,16(%esi), %xmm3
253 pcmpeqb 16(%edi), %xmm3
255 L(shr_1_gobble_loop):
261 movdqa 64(%esi), %xmm3
262 palignr $1,48(%esi), %xmm3
264 movdqa 48(%esi), %xmm0
265 palignr $1,32(%esi), %xmm0
266 pcmpeqb 32(%edi), %xmm0
268 pcmpeqb 48(%edi), %xmm3
271 jz L(shr_1_gobble_loop)
274 jge L(shr_1_gobble_next)
277 L(shr_1_gobble_next):
288 lea (%ecx, %edi,1), %eax
289 lea 1(%ecx, %esi,1), %edx
/* shr_2 path: %esi is 2 bytes past 16-byte alignment; palignr $2 restores
   the byte stream from aligned loads.  Same shape as shr_1.
   NOTE(review): non-contiguous excerpt; intervening lines missing.  */
304 movdqa 16(%esi), %xmm1
306 palignr $2,(%esi), %xmm1
307 pcmpeqb (%edi), %xmm1
309 movdqa 32(%esi), %xmm3
310 palignr $2,%xmm2, %xmm3
311 pcmpeqb 16(%edi), %xmm3
/* End pointers; +2 on %esi undoes the alignment bias.  */
319 lea (%ecx, %edi,1), %eax
320 lea 2(%ecx, %esi,1), %edx
/* shr_2 gobble (unrolled) variant.  */
330 movdqa 16(%esi), %xmm0
331 palignr $2,(%esi), %xmm0
332 pcmpeqb (%edi), %xmm0
334 movdqa 32(%esi), %xmm3
335 palignr $2,16(%esi), %xmm3
336 pcmpeqb 16(%edi), %xmm3
338 L(shr_2_gobble_loop):
344 movdqa 64(%esi), %xmm3
345 palignr $2,48(%esi), %xmm3
347 movdqa 48(%esi), %xmm0
348 palignr $2,32(%esi), %xmm0
349 pcmpeqb 32(%edi), %xmm0
351 pcmpeqb 48(%edi), %xmm3
354 jz L(shr_2_gobble_loop)
357 jge L(shr_2_gobble_next)
360 L(shr_2_gobble_next):
371 lea (%ecx, %edi,1), %eax
372 lea 2(%ecx, %esi,1), %edx
/* shr_3 path: %esi is 3 bytes past 16-byte alignment (palignr $3).
   NOTE(review): non-contiguous excerpt; intervening lines missing.  */
386 movdqa 16(%esi), %xmm1
388 palignr $3,(%esi), %xmm1
389 pcmpeqb (%edi), %xmm1
391 movdqa 32(%esi), %xmm3
392 palignr $3,%xmm2, %xmm3
393 pcmpeqb 16(%edi), %xmm3
/* End pointers; +3 on %esi undoes the alignment bias.  */
401 lea (%ecx, %edi,1), %eax
402 lea 3(%ecx, %esi,1), %edx
/* shr_3 gobble (unrolled) variant.  */
412 movdqa 16(%esi), %xmm0
413 palignr $3,(%esi), %xmm0
414 pcmpeqb (%edi), %xmm0
416 movdqa 32(%esi), %xmm3
417 palignr $3,16(%esi), %xmm3
418 pcmpeqb 16(%edi), %xmm3
420 L(shr_3_gobble_loop):
426 movdqa 64(%esi), %xmm3
427 palignr $3,48(%esi), %xmm3
429 movdqa 48(%esi), %xmm0
430 palignr $3,32(%esi), %xmm0
431 pcmpeqb 32(%edi), %xmm0
433 pcmpeqb 48(%edi), %xmm3
436 jz L(shr_3_gobble_loop)
439 jge L(shr_3_gobble_next)
442 L(shr_3_gobble_next):
453 lea (%ecx, %edi,1), %eax
454 lea 3(%ecx, %esi,1), %edx
/* shr_4 path: %esi is 4 bytes past 16-byte alignment (palignr $4).
   NOTE(review): non-contiguous excerpt; intervening lines missing.  */
468 movdqa 16(%esi), %xmm1
470 palignr $4,(%esi), %xmm1
471 pcmpeqb (%edi), %xmm1
473 movdqa 32(%esi), %xmm3
474 palignr $4,%xmm2, %xmm3
475 pcmpeqb 16(%edi), %xmm3
/* End pointers; +4 on %esi undoes the alignment bias.  */
483 lea (%ecx, %edi,1), %eax
484 lea 4(%ecx, %esi,1), %edx
/* shr_4 gobble (unrolled) variant.  */
494 movdqa 16(%esi), %xmm0
495 palignr $4,(%esi), %xmm0
496 pcmpeqb (%edi), %xmm0
498 movdqa 32(%esi), %xmm3
499 palignr $4,16(%esi), %xmm3
500 pcmpeqb 16(%edi), %xmm3
502 L(shr_4_gobble_loop):
508 movdqa 64(%esi), %xmm3
509 palignr $4,48(%esi), %xmm3
511 movdqa 48(%esi), %xmm0
512 palignr $4,32(%esi), %xmm0
513 pcmpeqb 32(%edi), %xmm0
515 pcmpeqb 48(%edi), %xmm3
518 jz L(shr_4_gobble_loop)
521 jge L(shr_4_gobble_next)
524 L(shr_4_gobble_next):
535 lea (%ecx, %edi,1), %eax
536 lea 4(%ecx, %esi,1), %edx
/* shr_5 path: %esi is 5 bytes past 16-byte alignment (palignr $5).
   NOTE(review): non-contiguous excerpt; intervening lines missing.  */
550 movdqa 16(%esi), %xmm1
552 palignr $5,(%esi), %xmm1
553 pcmpeqb (%edi), %xmm1
555 movdqa 32(%esi), %xmm3
556 palignr $5,%xmm2, %xmm3
557 pcmpeqb 16(%edi), %xmm3
/* End pointers; +5 on %esi undoes the alignment bias.  */
565 lea (%ecx, %edi,1), %eax
566 lea 5(%ecx, %esi,1), %edx
/* shr_5 gobble (unrolled) variant.  */
576 movdqa 16(%esi), %xmm0
577 palignr $5,(%esi), %xmm0
578 pcmpeqb (%edi), %xmm0
580 movdqa 32(%esi), %xmm3
581 palignr $5,16(%esi), %xmm3
582 pcmpeqb 16(%edi), %xmm3
584 L(shr_5_gobble_loop):
590 movdqa 64(%esi), %xmm3
591 palignr $5,48(%esi), %xmm3
593 movdqa 48(%esi), %xmm0
594 palignr $5,32(%esi), %xmm0
595 pcmpeqb 32(%edi), %xmm0
597 pcmpeqb 48(%edi), %xmm3
600 jz L(shr_5_gobble_loop)
603 jge L(shr_5_gobble_next)
606 L(shr_5_gobble_next):
617 lea (%ecx, %edi,1), %eax
618 lea 5(%ecx, %esi,1), %edx
/* shr_6 path: %esi is 6 bytes past 16-byte alignment (palignr $6).
   NOTE(review): non-contiguous excerpt; intervening lines missing.  */
632 movdqa 16(%esi), %xmm1
634 palignr $6,(%esi), %xmm1
635 pcmpeqb (%edi), %xmm1
637 movdqa 32(%esi), %xmm3
638 palignr $6,%xmm2, %xmm3
639 pcmpeqb 16(%edi), %xmm3
/* End pointers; +6 on %esi undoes the alignment bias.  */
647 lea (%ecx, %edi,1), %eax
648 lea 6(%ecx, %esi,1), %edx
/* shr_6 gobble (unrolled) variant.  */
658 movdqa 16(%esi), %xmm0
659 palignr $6,(%esi), %xmm0
660 pcmpeqb (%edi), %xmm0
662 movdqa 32(%esi), %xmm3
663 palignr $6,16(%esi), %xmm3
664 pcmpeqb 16(%edi), %xmm3
666 L(shr_6_gobble_loop):
672 movdqa 64(%esi), %xmm3
673 palignr $6,48(%esi), %xmm3
675 movdqa 48(%esi), %xmm0
676 palignr $6,32(%esi), %xmm0
677 pcmpeqb 32(%edi), %xmm0
679 pcmpeqb 48(%edi), %xmm3
682 jz L(shr_6_gobble_loop)
685 jge L(shr_6_gobble_next)
688 L(shr_6_gobble_next):
699 lea (%ecx, %edi,1), %eax
700 lea 6(%ecx, %esi,1), %edx
/* shr_7 path: %esi is 7 bytes past 16-byte alignment (palignr $7).
   NOTE(review): non-contiguous excerpt; intervening lines missing.  */
714 movdqa 16(%esi), %xmm1
716 palignr $7,(%esi), %xmm1
717 pcmpeqb (%edi), %xmm1
719 movdqa 32(%esi), %xmm3
720 palignr $7,%xmm2, %xmm3
721 pcmpeqb 16(%edi), %xmm3
/* End pointers; +7 on %esi undoes the alignment bias.  */
729 lea (%ecx, %edi,1), %eax
730 lea 7(%ecx, %esi,1), %edx
/* shr_7 gobble (unrolled) variant.  */
740 movdqa 16(%esi), %xmm0
741 palignr $7,(%esi), %xmm0
742 pcmpeqb (%edi), %xmm0
744 movdqa 32(%esi), %xmm3
745 palignr $7,16(%esi), %xmm3
746 pcmpeqb 16(%edi), %xmm3
748 L(shr_7_gobble_loop):
754 movdqa 64(%esi), %xmm3
755 palignr $7,48(%esi), %xmm3
757 movdqa 48(%esi), %xmm0
758 palignr $7,32(%esi), %xmm0
759 pcmpeqb 32(%edi), %xmm0
761 pcmpeqb 48(%edi), %xmm3
764 jz L(shr_7_gobble_loop)
767 jge L(shr_7_gobble_next)
770 L(shr_7_gobble_next):
781 lea (%ecx, %edi,1), %eax
782 lea 7(%ecx, %esi,1), %edx
/* shr_8 path: %esi is 8 bytes past 16-byte alignment (palignr $8).
   NOTE(review): non-contiguous excerpt; intervening lines missing.  */
796 movdqa 16(%esi), %xmm1
798 palignr $8,(%esi), %xmm1
799 pcmpeqb (%edi), %xmm1
801 movdqa 32(%esi), %xmm3
802 palignr $8,%xmm2, %xmm3
803 pcmpeqb 16(%edi), %xmm3
/* End pointers; +8 on %esi undoes the alignment bias.  */
811 lea (%ecx, %edi,1), %eax
812 lea 8(%ecx, %esi,1), %edx
/* shr_8 gobble (unrolled) variant.  */
822 movdqa 16(%esi), %xmm0
823 palignr $8,(%esi), %xmm0
824 pcmpeqb (%edi), %xmm0
826 movdqa 32(%esi), %xmm3
827 palignr $8,16(%esi), %xmm3
828 pcmpeqb 16(%edi), %xmm3
830 L(shr_8_gobble_loop):
836 movdqa 64(%esi), %xmm3
837 palignr $8,48(%esi), %xmm3
839 movdqa 48(%esi), %xmm0
840 palignr $8,32(%esi), %xmm0
841 pcmpeqb 32(%edi), %xmm0
843 pcmpeqb 48(%edi), %xmm3
846 jz L(shr_8_gobble_loop)
849 jge L(shr_8_gobble_next)
852 L(shr_8_gobble_next):
863 lea (%ecx, %edi,1), %eax
864 lea 8(%ecx, %esi,1), %edx
/* shr_9 path: %esi is 9 bytes past 16-byte alignment (palignr $9).
   NOTE(review): non-contiguous excerpt; intervening lines missing.  */
878 movdqa 16(%esi), %xmm1
880 palignr $9,(%esi), %xmm1
881 pcmpeqb (%edi), %xmm1
883 movdqa 32(%esi), %xmm3
884 palignr $9,%xmm2, %xmm3
885 pcmpeqb 16(%edi), %xmm3
/* End pointers; +9 on %esi undoes the alignment bias.  */
893 lea (%ecx, %edi,1), %eax
894 lea 9(%ecx, %esi,1), %edx
/* shr_9 gobble (unrolled) variant.  */
904 movdqa 16(%esi), %xmm0
905 palignr $9,(%esi), %xmm0
906 pcmpeqb (%edi), %xmm0
908 movdqa 32(%esi), %xmm3
909 palignr $9,16(%esi), %xmm3
910 pcmpeqb 16(%edi), %xmm3
912 L(shr_9_gobble_loop):
918 movdqa 64(%esi), %xmm3
919 palignr $9,48(%esi), %xmm3
921 movdqa 48(%esi), %xmm0
922 palignr $9,32(%esi), %xmm0
923 pcmpeqb 32(%edi), %xmm0
925 pcmpeqb 48(%edi), %xmm3
928 jz L(shr_9_gobble_loop)
931 jge L(shr_9_gobble_next)
934 L(shr_9_gobble_next):
945 lea (%ecx, %edi,1), %eax
946 lea 9(%ecx, %esi,1), %edx
/* shr_10 path: %esi is 10 bytes past 16-byte alignment (palignr $10).
   NOTE(review): non-contiguous excerpt; intervening lines missing.  */
960 movdqa 16(%esi), %xmm1
962 palignr $10, (%esi), %xmm1
963 pcmpeqb (%edi), %xmm1
965 movdqa 32(%esi), %xmm3
966 palignr $10,%xmm2, %xmm3
967 pcmpeqb 16(%edi), %xmm3
/* End pointers; +10 on %esi undoes the alignment bias.  */
975 lea (%ecx, %edi,1), %eax
976 lea 10(%ecx, %esi,1), %edx
/* shr_10 gobble (unrolled) variant.  */
986 movdqa 16(%esi), %xmm0
987 palignr $10, (%esi), %xmm0
988 pcmpeqb (%edi), %xmm0
990 movdqa 32(%esi), %xmm3
991 palignr $10, 16(%esi), %xmm3
992 pcmpeqb 16(%edi), %xmm3
994 L(shr_10_gobble_loop):
1000 movdqa 64(%esi), %xmm3
1001 palignr $10,48(%esi), %xmm3
1003 movdqa 48(%esi), %xmm0
1004 palignr $10,32(%esi), %xmm0
1005 pcmpeqb 32(%edi), %xmm0
1007 pcmpeqb 48(%edi), %xmm3
1010 jz L(shr_10_gobble_loop)
1013 jge L(shr_10_gobble_next)
1016 L(shr_10_gobble_next):
/* Extract the per-byte equality mask to locate the mismatching byte.  */
1020 pmovmskb %xmm3, %edx
1027 lea (%ecx, %edi,1), %eax
1028 lea 10(%ecx, %esi,1), %edx
/* shr_11 path: %esi is 11 bytes past 16-byte alignment (palignr $11).
   Large lengths branch to the gobble variant first.
   NOTE(review): non-contiguous excerpt; intervening lines missing.  */
1040 jae L(shr_11_gobble)
1042 movdqa 16(%esi), %xmm1
1044 palignr $11, (%esi), %xmm1
1045 pcmpeqb (%edi), %xmm1
1047 movdqa 32(%esi), %xmm3
1048 palignr $11, %xmm2, %xmm3
1049 pcmpeqb 16(%edi), %xmm3
/* Per-byte equality mask, then end-pointer fixup (+11 on %esi).  */
1052 pmovmskb %xmm3, %edx
1057 lea (%ecx, %edi,1), %eax
1058 lea 11(%ecx, %esi,1), %edx
/* shr_11 gobble (unrolled) variant.  */
1068 movdqa 16(%esi), %xmm0
1069 palignr $11, (%esi), %xmm0
1070 pcmpeqb (%edi), %xmm0
1072 movdqa 32(%esi), %xmm3
1073 palignr $11, 16(%esi), %xmm3
1074 pcmpeqb 16(%edi), %xmm3
1076 L(shr_11_gobble_loop):
1079 pmovmskb %xmm3, %edx
1082 movdqa 64(%esi), %xmm3
1083 palignr $11,48(%esi), %xmm3
1085 movdqa 48(%esi), %xmm0
1086 palignr $11,32(%esi), %xmm0
1087 pcmpeqb 32(%edi), %xmm0
1089 pcmpeqb 48(%edi), %xmm3
1092 jz L(shr_11_gobble_loop)
1095 jge L(shr_11_gobble_next)
1098 L(shr_11_gobble_next):
1102 pmovmskb %xmm3, %edx
1109 lea (%ecx, %edi,1), %eax
1110 lea 11(%ecx, %esi,1), %edx
/* shr_12 path: %esi is 12 bytes past 16-byte alignment (palignr $12).
   NOTE(review): non-contiguous excerpt; intervening lines missing.  */
1122 jae L(shr_12_gobble)
1124 movdqa 16(%esi), %xmm1
1126 palignr $12, (%esi), %xmm1
1127 pcmpeqb (%edi), %xmm1
1129 movdqa 32(%esi), %xmm3
1130 palignr $12, %xmm2, %xmm3
1131 pcmpeqb 16(%edi), %xmm3
/* Per-byte equality mask, then end-pointer fixup (+12 on %esi).  */
1134 pmovmskb %xmm3, %edx
1139 lea (%ecx, %edi,1), %eax
1140 lea 12(%ecx, %esi,1), %edx
/* shr_12 gobble (unrolled) variant.  */
1150 movdqa 16(%esi), %xmm0
1151 palignr $12, (%esi), %xmm0
1152 pcmpeqb (%edi), %xmm0
1154 movdqa 32(%esi), %xmm3
1155 palignr $12, 16(%esi), %xmm3
1156 pcmpeqb 16(%edi), %xmm3
1158 L(shr_12_gobble_loop):
1161 pmovmskb %xmm3, %edx
1164 movdqa 64(%esi), %xmm3
1165 palignr $12,48(%esi), %xmm3
1167 movdqa 48(%esi), %xmm0
1168 palignr $12,32(%esi), %xmm0
1169 pcmpeqb 32(%edi), %xmm0
1171 pcmpeqb 48(%edi), %xmm3
1174 jz L(shr_12_gobble_loop)
1177 jge L(shr_12_gobble_next)
1180 L(shr_12_gobble_next):
1184 pmovmskb %xmm3, %edx
1191 lea (%ecx, %edi,1), %eax
1192 lea 12(%ecx, %esi,1), %edx
/* shr_13 path: %esi is 13 bytes past 16-byte alignment (palignr $13).
   NOTE(review): non-contiguous excerpt; intervening lines missing.  */
1204 jae L(shr_13_gobble)
1206 movdqa 16(%esi), %xmm1
1208 palignr $13, (%esi), %xmm1
1209 pcmpeqb (%edi), %xmm1
1211 movdqa 32(%esi), %xmm3
1212 palignr $13, %xmm2, %xmm3
1213 pcmpeqb 16(%edi), %xmm3
/* Per-byte equality mask, then end-pointer fixup (+13 on %esi).  */
1216 pmovmskb %xmm3, %edx
1221 lea (%ecx, %edi,1), %eax
1222 lea 13(%ecx, %esi,1), %edx
/* shr_13 gobble (unrolled) variant.  */
1232 movdqa 16(%esi), %xmm0
1233 palignr $13, (%esi), %xmm0
1234 pcmpeqb (%edi), %xmm0
1236 movdqa 32(%esi), %xmm3
1237 palignr $13, 16(%esi), %xmm3
1238 pcmpeqb 16(%edi), %xmm3
1240 L(shr_13_gobble_loop):
1243 pmovmskb %xmm3, %edx
1246 movdqa 64(%esi), %xmm3
1247 palignr $13,48(%esi), %xmm3
1249 movdqa 48(%esi), %xmm0
1250 palignr $13,32(%esi), %xmm0
1251 pcmpeqb 32(%edi), %xmm0
1253 pcmpeqb 48(%edi), %xmm3
1256 jz L(shr_13_gobble_loop)
1259 jge L(shr_13_gobble_next)
1262 L(shr_13_gobble_next):
1266 pmovmskb %xmm3, %edx
1273 lea (%ecx, %edi,1), %eax
1274 lea 13(%ecx, %esi,1), %edx
/* shr_14 path: %esi is 14 bytes past 16-byte alignment (palignr $14).
   NOTE(review): non-contiguous excerpt; intervening lines missing.  */
1286 jae L(shr_14_gobble)
1288 movdqa 16(%esi), %xmm1
1290 palignr $14, (%esi), %xmm1
1291 pcmpeqb (%edi), %xmm1
1293 movdqa 32(%esi), %xmm3
1294 palignr $14, %xmm2, %xmm3
1295 pcmpeqb 16(%edi), %xmm3
/* Per-byte equality mask, then end-pointer fixup (+14 on %esi).  */
1298 pmovmskb %xmm3, %edx
1303 lea (%ecx, %edi,1), %eax
1304 lea 14(%ecx, %esi,1), %edx
/* shr_14 gobble (unrolled) variant.  */
1314 movdqa 16(%esi), %xmm0
1315 palignr $14, (%esi), %xmm0
1316 pcmpeqb (%edi), %xmm0
1318 movdqa 32(%esi), %xmm3
1319 palignr $14, 16(%esi), %xmm3
1320 pcmpeqb 16(%edi), %xmm3
1322 L(shr_14_gobble_loop):
1325 pmovmskb %xmm3, %edx
1328 movdqa 64(%esi), %xmm3
1329 palignr $14,48(%esi), %xmm3
1331 movdqa 48(%esi), %xmm0
1332 palignr $14,32(%esi), %xmm0
1333 pcmpeqb 32(%edi), %xmm0
1335 pcmpeqb 48(%edi), %xmm3
1338 jz L(shr_14_gobble_loop)
1341 jge L(shr_14_gobble_next)
1344 L(shr_14_gobble_next):
1348 pmovmskb %xmm3, %edx
1355 lea (%ecx, %edi,1), %eax
1356 lea 14(%ecx, %esi,1), %edx
/* shr_15 path: %esi is 15 bytes past 16-byte alignment (palignr $15) —
   the last of the 16 per-offset variants.
   NOTE(review): non-contiguous excerpt; intervening lines missing.  */
1368 jae L(shr_15_gobble)
1370 movdqa 16(%esi), %xmm1
1372 palignr $15, (%esi), %xmm1
1373 pcmpeqb (%edi), %xmm1
1375 movdqa 32(%esi), %xmm3
1376 palignr $15, %xmm2, %xmm3
1377 pcmpeqb 16(%edi), %xmm3
/* Per-byte equality mask, then end-pointer fixup (+15 on %esi).  */
1380 pmovmskb %xmm3, %edx
1385 lea (%ecx, %edi,1), %eax
1386 lea 15(%ecx, %esi,1), %edx
/* shr_15 gobble (unrolled) variant.  */
1396 movdqa 16(%esi), %xmm0
1397 palignr $15, (%esi), %xmm0
1398 pcmpeqb (%edi), %xmm0
1400 movdqa 32(%esi), %xmm3
1401 palignr $15, 16(%esi), %xmm3
1402 pcmpeqb 16(%edi), %xmm3
1404 L(shr_15_gobble_loop):
1407 pmovmskb %xmm3, %edx
1410 movdqa 64(%esi), %xmm3
1411 palignr $15,48(%esi), %xmm3
1413 movdqa 48(%esi), %xmm0
1414 palignr $15,32(%esi), %xmm0
1415 pcmpeqb 32(%edi), %xmm0
1417 pcmpeqb 48(%edi), %xmm3
1420 jz L(shr_15_gobble_loop)
1423 jge L(shr_15_gobble_next)
1426 L(shr_15_gobble_next):
1430 pmovmskb %xmm3, %edx
1437 lea (%ecx, %edi,1), %eax
1438 lea 15(%ecx, %esi,1), %edx
/* Mismatch-resolution tails: once a 16-byte chunk differs, load the byte
   at the offending offset from each buffer (zero-extended) so the caller
   can form the signed memcmp result — presumably via sub %edx, %eax on the
   elided lines; TODO confirm against the full file.
   NOTE(review): non-contiguous excerpt; the branches selecting each pair
   and the subtract/return sequences are missing from this view.  */
1447 pmovmskb %xmm1, %ebx
1480 movzbl -9(%edi), %eax
1481 movzbl -9(%esi), %edx
1487 movzbl -16(%edi), %eax
1488 movzbl -16(%esi), %edx
1494 movzbl -15(%edi), %eax
1495 movzbl -15(%esi), %edx
1501 movzbl -14(%edi), %eax
1502 movzbl -14(%esi), %edx
1508 movzbl -13(%edi), %eax
1509 movzbl -13(%esi), %edx
1515 movzbl -12(%edi), %eax
1516 movzbl -12(%esi), %edx
1522 movzbl -11(%edi), %eax
1523 movzbl -11(%esi), %edx
1529 movzbl -10(%edi), %eax
1530 movzbl -10(%esi), %edx
1561 movzbl -9(%edi), %eax
1562 movzbl -9(%esi), %edx
/* Short-length / residual compare tails: 4-byte (movl), 2-byte (movzwl)
   and 1-byte (movzbl) loads at negative offsets from the end pointers
   (%eax = end of first buffer, %edx = end of second, per the lea fixups
   above).  The compare/branch/return glue between pairs is elided.
   NOTE(review): non-contiguous excerpt — do not assume adjacency of the
   visible pairs; leading numbers are original-file line numbers.  */
1798 movzbl -1(%eax), %ecx
1863 movzwl -2(%eax), %ecx
1864 movzwl -2(%edx), %ebx
1876 movl -47(%eax), %ecx
1877 movl -47(%edx), %ebx
1881 movl -43(%eax), %ecx
1882 movl -43(%edx), %ebx
1886 movl -39(%eax), %ecx
1887 movl -39(%edx), %ebx
1891 movl -35(%eax), %ecx
1892 movl -35(%edx), %ebx
1896 movl -31(%eax), %ecx
1897 movl -31(%edx), %ebx
1901 movl -27(%eax), %ecx
1902 movl -27(%edx), %ebx
1906 movl -23(%eax), %ecx
1907 movl -23(%edx), %ebx
1911 movl -19(%eax), %ecx
1912 movl -19(%edx), %ebx
1916 movl -15(%eax), %ecx
1917 movl -15(%edx), %ebx
1921 movl -11(%eax), %ecx
1922 movl -11(%edx), %ebx
1931 movzwl -3(%eax), %ecx
1932 movzwl -3(%edx), %ebx
1937 movzbl -1(%eax), %eax