]>
Commit | Line | Data |
---|---|---|
d9a4d2ab UD |
1 | /* strcmp with SSE4.2 |
2 | Copyright (C) 2009, 2010, 2011 Free Software Foundation, Inc. | |
3 | Contributed by Intel Corporation. | |
4 | This file is part of the GNU C Library. | |
5 | ||
6 | The GNU C Library is free software; you can redistribute it and/or | |
7 | modify it under the terms of the GNU Lesser General Public | |
8 | License as published by the Free Software Foundation; either | |
9 | version 2.1 of the License, or (at your option) any later version. | |
10 | ||
11 | The GNU C Library is distributed in the hope that it will be useful, | |
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
14 | Lesser General Public License for more details. | |
15 | ||
16 | You should have received a copy of the GNU Lesser General Public | |
59ba27a6 PE |
17 | License along with the GNU C Library; if not, see |
18 | <http://www.gnu.org/licenses/>. */ | |
d9a4d2ab UD |
19 | |
20 | ||
21 | /* We use 0x1a: | |
22 | _SIDD_SBYTE_OPS | |
23 | | _SIDD_CMP_EQUAL_EACH | |
24 | | _SIDD_NEGATIVE_POLARITY | |
25 | | _SIDD_LEAST_SIGNIFICANT | |
26 | on pcmpistri to find out if two 16byte data elements are the same | |
27 | and the offset of the first different byte. There are 4 cases: | |
28 | ||
29 | 1. Both 16byte data elements are valid and identical. | |
30 | 2. Both 16byte data elements have EOS and are identical. | |
31 | 3. Both 16byte data elements are valid and they differ at offset X. | |
32 | 4. At least one 16byte data element has EOS at offset X. Two 16byte | |
33 | data elements must differ at or before offset X. | |
34 | ||
35 | Here is the table of ECX, CFlag, ZFlag and SFlag for 4 cases: | |
36 | ||
37 | case ECX CFlag ZFlag SFlag | |
38 | 1 16 0 0 0 | |
39 | 2 16 0 1 1 | |
40 | 3 X 1 0 0 | |
41 | 4 0 <= X 1 0/1 0/1 | |
42 | ||
43 | We exit from the loop for cases 2, 3 and 4 with jbe, which branches | |
44 | when either CFlag or ZFlag is 1. If CFlag == 0 after that exit, it | |
45 | must be case 2 (identical up to EOS), so we return 0. */ | |
46 | ||
47 | /* Put all SSE 4.2 functions together. */ | |
48 | .section .text.SECTION,"ax",@progbits | |
49 | .align 16 | |
50 | .type STRCMP_SSE42, @function | |
51 | #ifdef USE_AS_STRCASECMP_L | |
52 | ENTRY (GLABEL(__strcasecmp)) /* Entry without a locale argument: loads the thread-local __libc_tsd_LOCALE value into %rdx and runs on into the code that follows (no ret/jmp here). */ | |
53 | movq __libc_tsd_LOCALE@gottpoff(%rip),%rax /* rax = thread-pointer-relative offset of __libc_tsd_LOCALE, read from the GOT */ | |
54 | movq %fs:(%rax),%rdx /* rdx = value of that thread-local; the code below dereferences %rdx as the locale */ | |
55 | ||
56 | // XXX 5 byte should be before the function | |
57 | /* 5-byte NOP, emitted as raw opcode bytes. */ | |
58 | .byte 0x0f,0x1f,0x44,0x00,0x00 | |
59 | END (GLABEL(__strcasecmp)) | |
60 | /* FALLTHROUGH to strcasecmp_l. */ | |
61 | #endif | |
62 | #ifdef USE_AS_STRNCASECMP_L | |
63 | ENTRY (GLABEL(__strncasecmp)) /* Entry without a locale argument: loads the thread-local __libc_tsd_LOCALE value into %rcx and runs on into the code that follows (no ret/jmp here). */ | |
64 | movq __libc_tsd_LOCALE@gottpoff(%rip),%rax /* rax = thread-pointer-relative offset of __libc_tsd_LOCALE, read from the GOT */ | |
65 | movq %fs:(%rax),%rcx /* rcx = value of that thread-local; the code below dereferences %rcx as the locale (%rdx is tested there as the length) */ | |
66 | ||
67 | // XXX 5 byte should be before the function | |
68 | /* 5-byte NOP, emitted as raw opcode bytes. */ | |
69 | .byte 0x0f,0x1f,0x44,0x00,0x00 | |
70 | END (GLABEL(__strncasecmp)) | |
71 | /* FALLTHROUGH to strncasecmp_l. */ | |
72 | #endif | |
73 | ||
618280a1 UD |
74 | |
75 | #ifdef USE_AVX | |
76 | # define movdqa vmovdqa | |
77 | # define movdqu vmovdqu | |
78 | # define pmovmskb vpmovmskb | |
79 | # define pcmpistri vpcmpistri | |
80 | # define psubb vpsubb | |
81 | # define pcmpeqb vpcmpeqb | |
82 | # define psrldq vpsrldq | |
83 | # define pslldq vpslldq | |
84 | # define palignr vpalignr | |
85 | # define pxor vpxor | |
86 | # define D(arg) arg, arg | |
87 | #else | |
88 | # define D(arg) arg | |
89 | #endif | |
90 | ||
d9a4d2ab UD |
91 | STRCMP_SSE42: |
92 | cfi_startproc | |
93 | CALL_MCOUNT | |
94 | ||
95 | /* | |
96 | * This implementation uses SSE to compare up to 16 bytes at a time. | |
97 | */ | |
98 | #ifdef USE_AS_STRCASECMP_L | |
99 | /* We have to fall back on the C implementation for locales | |
100 | with encodings not matching ASCII for single bytes. */ | |
101 | # if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 | |
102 | movq LOCALE_T___LOCALES+LC_CTYPE*8(%rdx), %rax | |
103 | # else | |
104 | movq (%rdx), %rax | |
105 | # endif | |
34372fc6 | 106 | testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax) |
d9a4d2ab UD |
107 | jne __strcasecmp_l_nonascii |
108 | #endif | |
109 | #ifdef USE_AS_STRNCASECMP_L | |
110 | /* We have to fall back on the C implementation for locales | |
111 | with encodings not matching ASCII for single bytes. */ | |
112 | # if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 | |
113 | movq LOCALE_T___LOCALES+LC_CTYPE*8(%rcx), %rax | |
114 | # else | |
115 | movq (%rcx), %rax | |
116 | # endif | |
34372fc6 | 117 | testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax) |
d9a4d2ab UD |
118 | jne __strncasecmp_l_nonascii |
119 | #endif | |
120 | ||
121 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
122 | test %rdx, %rdx | |
123 | je LABEL(strcmp_exitz) | |
124 | cmp $1, %rdx | |
125 | je LABEL(Byte0) | |
126 | mov %rdx, %r11 | |
127 | #endif | |
128 | mov %esi, %ecx | |
129 | mov %edi, %eax | |
130 | /* Use 64bit AND here to avoid long NOP padding. */ | |
131 | and $0x3f, %rcx /* rsi alignment in cache line */ | |
132 | and $0x3f, %rax /* rdi alignment in cache line */ | |
133 | #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L | |
134 | .section .rodata.cst16,"aM",@progbits,16 | |
135 | .align 16 | |
136 | LABEL(belowupper): | |
137 | .quad 0x4040404040404040 | |
138 | .quad 0x4040404040404040 | |
139 | LABEL(topupper): | |
140 | # ifdef USE_AVX | |
141 | .quad 0x5a5a5a5a5a5a5a5a | |
142 | .quad 0x5a5a5a5a5a5a5a5a | |
143 | # else | |
144 | .quad 0x5b5b5b5b5b5b5b5b | |
145 | .quad 0x5b5b5b5b5b5b5b5b | |
146 | # endif | |
147 | LABEL(touppermask): | |
148 | .quad 0x2020202020202020 | |
149 | .quad 0x2020202020202020 | |
150 | .previous | |
151 | movdqa LABEL(belowupper)(%rip), %xmm4 | |
152 | # define UCLOW_reg %xmm4 | |
153 | movdqa LABEL(topupper)(%rip), %xmm5 | |
154 | # define UCHIGH_reg %xmm5 | |
155 | movdqa LABEL(touppermask)(%rip), %xmm6 | |
156 | # define LCQWORD_reg %xmm6 | |
157 | #endif | |
158 | cmp $0x30, %ecx | |
159 | ja LABEL(crosscache)/* rsi: 16-byte load will cross cache line */ | |
160 | cmp $0x30, %eax | |
161 | ja LABEL(crosscache)/* rdi: 16-byte load will cross cache line */ | |
162 | movdqu (%rdi), %xmm1 | |
163 | movdqu (%rsi), %xmm2 | |
164 | #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L | |
165 | # ifdef USE_AVX | |
166 | # define TOLOWER(reg1, reg2) \ | |
167 | vpcmpgtb UCLOW_reg, reg1, %xmm7; \ | |
168 | vpcmpgtb UCHIGH_reg, reg1, %xmm8; \ | |
169 | vpcmpgtb UCLOW_reg, reg2, %xmm9; \ | |
170 | vpcmpgtb UCHIGH_reg, reg2, %xmm10; \ | |
171 | vpandn %xmm7, %xmm8, %xmm8; \ | |
172 | vpandn %xmm9, %xmm10, %xmm10; \ | |
173 | vpand LCQWORD_reg, %xmm8, %xmm8; \ | |
174 | vpand LCQWORD_reg, %xmm10, %xmm10; \ | |
175 | vpor reg1, %xmm8, reg1; \ | |
176 | vpor reg2, %xmm10, reg2 | |
177 | # else | |
178 | # define TOLOWER(reg1, reg2) \ | |
179 | movdqa reg1, %xmm7; \ | |
180 | movdqa UCHIGH_reg, %xmm8; \ | |
181 | movdqa reg2, %xmm9; \ | |
182 | movdqa UCHIGH_reg, %xmm10; \ | |
183 | pcmpgtb UCLOW_reg, %xmm7; \ | |
184 | pcmpgtb reg1, %xmm8; \ | |
185 | pcmpgtb UCLOW_reg, %xmm9; \ | |
186 | pcmpgtb reg2, %xmm10; \ | |
187 | pand %xmm8, %xmm7; \ | |
188 | pand %xmm10, %xmm9; \ | |
189 | pand LCQWORD_reg, %xmm7; \ | |
190 | pand LCQWORD_reg, %xmm9; \ | |
191 | por %xmm7, reg1; \ | |
192 | por %xmm9, reg2 | |
193 | # endif | |
194 | TOLOWER (%xmm1, %xmm2) | |
195 | #else | |
196 | # define TOLOWER(reg1, reg2) | |
197 | #endif | |
618280a1 UD |
198 | pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char checks */ |
199 | pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */ | |
200 | pcmpeqb %xmm2, D(%xmm1) /* compare first 16 bytes for equality */ | |
201 | psubb %xmm0, D(%xmm1) /* packed sub of comparison results*/ | |
d9a4d2ab UD |
202 | pmovmskb %xmm1, %edx |
203 | sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */ | |
204 | jnz LABEL(less16bytes)/* If not, find different value or null char */ | |
205 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
206 | sub $16, %r11 | |
207 | jbe LABEL(strcmp_exitz)/* finish comparison */ | |
208 | #endif | |
209 | add $16, %rsi /* prepare to search next 16 bytes */ | |
210 | add $16, %rdi /* prepare to search next 16 bytes */ | |
211 | ||
212 | /* | |
213 | * Determine source and destination string offsets from 16-byte | |
214 | * alignment. Use relative offset difference between the two to | |
215 | * determine which case below to use. | |
216 | */ | |
217 | .p2align 4 | |
218 | LABEL(crosscache): | |
219 | and $0xfffffffffffffff0, %rsi /* force %rsi is 16 byte aligned */ | |
220 | and $0xfffffffffffffff0, %rdi /* force %rdi is 16 byte aligned */ | |
221 | mov $0xffff, %edx /* for equivalent offset */ | |
222 | xor %r8d, %r8d | |
223 | and $0xf, %ecx /* offset of rsi */ | |
224 | and $0xf, %eax /* offset of rdi */ | |
618280a1 | 225 | pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char check */ |
d9a4d2ab UD |
226 | cmp %eax, %ecx |
227 | je LABEL(ashr_0) /* rsi and rdi relative offset same */ | |
228 | ja LABEL(bigger) | |
229 | mov %edx, %r8d /* r8d is offset flag for exit tail */ | |
230 | xchg %ecx, %eax | |
231 | xchg %rsi, %rdi | |
232 | LABEL(bigger): | |
618280a1 UD |
233 | movdqa (%rdi), %xmm2 |
234 | movdqa (%rsi), %xmm1 | |
d9a4d2ab UD |
235 | lea 15(%rax), %r9 |
236 | sub %rcx, %r9 | |
237 | lea LABEL(unaligned_table)(%rip), %r10 | |
238 | movslq (%r10, %r9,4), %r9 | |
618280a1 | 239 | pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */ |
d9a4d2ab UD |
240 | lea (%r10, %r9), %r10 |
241 | jmp *%r10 /* jump to corresponding case */ | |
242 | ||
243 | /* | |
244 | * The following cases will be handled by ashr_0 | |
245 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case | |
246 | * n(0~15) n(0~15) 15(15+ n-n) ashr_0 | |
247 | */ | |
248 | .p2align 4 | |
249 | LABEL(ashr_0): | |
250 | ||
251 | movdqa (%rsi), %xmm1 | |
618280a1 | 252 | pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */ |
d9a4d2ab | 253 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
618280a1 | 254 | pcmpeqb (%rdi), D(%xmm1) /* compare 16 bytes for equality */ |
d9a4d2ab UD |
255 | #else |
256 | movdqa (%rdi), %xmm2 | |
257 | TOLOWER (%xmm1, %xmm2) | |
618280a1 | 258 | pcmpeqb %xmm2, D(%xmm1) /* compare 16 bytes for equality */ |
d9a4d2ab | 259 | #endif |
618280a1 | 260 | psubb %xmm0, D(%xmm1) /* packed sub of comparison results*/ |
d9a4d2ab UD |
261 | pmovmskb %xmm1, %r9d |
262 | shr %cl, %edx /* adjust 0xffff for offset */ | |
263 | shr %cl, %r9d /* adjust for 16-byte offset */ | |
264 | sub %r9d, %edx | |
265 | /* | |
266 | * edx must be the same as r9d if the remaining (16-rcx) bytes are equal to | |
267 | * those starting at (16-rax) and no null char was seen. | |
268 | */ | |
269 | jne LABEL(less32bytes) /* mismatch or null char */ | |
270 | UPDATE_STRNCMP_COUNTER | |
271 | mov $16, %rcx | |
272 | mov $16, %r9 | |
d9a4d2ab UD |
273 | |
274 | /* | |
275 | * Now both strings are aligned at 16-byte boundary. Loop over strings | |
276 | * checking 32-bytes per iteration. | |
277 | */ | |
278 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ | |
279 | .p2align 4 | |
280 | LABEL(ashr_0_use): | |
281 | movdqa (%rdi,%rdx), %xmm0 | |
282 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L | |
283 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
284 | #else | |
285 | movdqa (%rsi,%rdx), %xmm1 | |
286 | TOLOWER (%xmm0, %xmm1) | |
287 | pcmpistri $0x1a, %xmm1, %xmm0 | |
288 | #endif | |
289 | lea 16(%rdx), %rdx | |
290 | jbe LABEL(ashr_0_exit_use) | |
291 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
292 | sub $16, %r11 | |
293 | jbe LABEL(strcmp_exitz) | |
294 | #endif | |
295 | ||
296 | movdqa (%rdi,%rdx), %xmm0 | |
297 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L | |
298 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
299 | #else | |
300 | movdqa (%rsi,%rdx), %xmm1 | |
301 | TOLOWER (%xmm0, %xmm1) | |
302 | pcmpistri $0x1a, %xmm1, %xmm0 | |
303 | #endif | |
304 | lea 16(%rdx), %rdx | |
305 | jbe LABEL(ashr_0_exit_use) | |
306 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
307 | sub $16, %r11 | |
308 | jbe LABEL(strcmp_exitz) | |
309 | #endif | |
310 | jmp LABEL(ashr_0_use) | |
311 | ||
312 | ||
313 | .p2align 4 | |
314 | LABEL(ashr_0_exit_use): | |
315 | jnc LABEL(strcmp_exitz) | |
316 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
317 | sub %rcx, %r11 | |
318 | jbe LABEL(strcmp_exitz) | |
319 | #endif | |
320 | lea -16(%rdx, %rcx), %rcx | |
321 | movzbl (%rdi, %rcx), %eax | |
322 | movzbl (%rsi, %rcx), %edx | |
323 | #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L | |
324 | leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx | |
325 | movl (%rcx,%rax,4), %eax | |
326 | movl (%rcx,%rdx,4), %edx | |
327 | #endif | |
328 | sub %edx, %eax | |
329 | ret | |
330 | ||
331 | ||
332 | ||
333 | /* | |
334 | * The following cases will be handled by ashr_1 | |
335 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case | |
336 | * n(15) n -15 0(15 +(n-15) - n) ashr_1 | |
337 | */ | |
338 | .p2align 4 | |
339 | LABEL(ashr_1): | |
618280a1 | 340 | pslldq $15, D(%xmm2) /* shift first string to align with second */ |
d9a4d2ab | 341 | TOLOWER (%xmm1, %xmm2) |
618280a1 UD |
342 | pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */ |
343 | psubb %xmm0, D(%xmm2) /* packed sub of comparison results*/ | |
d9a4d2ab UD |
344 | pmovmskb %xmm2, %r9d |
345 | shr %cl, %edx /* adjust 0xffff for offset */ | |
346 | shr %cl, %r9d /* adjust for 16-byte offset */ | |
347 | sub %r9d, %edx | |
348 | jnz LABEL(less32bytes) /* mismatch or null char seen */ | |
349 | movdqa (%rdi), %xmm3 | |
350 | UPDATE_STRNCMP_COUNTER | |
351 | ||
d9a4d2ab UD |
352 | mov $16, %rcx /* index for loads*/ |
353 | mov $1, %r9d /* byte position left over from less32bytes case */ | |
354 | /* | |
355 | * Setting up %r10 this way allows us to detect crossing a page boundary. | |
356 | * When %r10 goes positive we have crossed a page boundary and | |
357 | * need to do a nibble. | |
358 | */ | |
359 | lea 1(%rdi), %r10 | |
360 | and $0xfff, %r10 /* offset into 4K page */ | |
361 | sub $0x1000, %r10 /* subtract 4K pagesize */ | |
362 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ | |
363 | ||
364 | .p2align 4 | |
365 | LABEL(loop_ashr_1_use): | |
366 | add $16, %r10 | |
367 | jg LABEL(nibble_ashr_1_use) | |
368 | ||
369 | LABEL(nibble_ashr_1_restart_use): | |
370 | movdqa (%rdi, %rdx), %xmm0 | |
618280a1 | 371 | palignr $1, -16(%rdi, %rdx), D(%xmm0) |
d9a4d2ab UD |
372 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
373 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
374 | #else | |
375 | movdqa (%rsi,%rdx), %xmm1 | |
376 | TOLOWER (%xmm0, %xmm1) | |
377 | pcmpistri $0x1a, %xmm1, %xmm0 | |
378 | #endif | |
379 | jbe LABEL(exit_use) | |
380 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
381 | sub $16, %r11 | |
382 | jbe LABEL(strcmp_exitz) | |
383 | #endif | |
384 | ||
385 | add $16, %rdx | |
386 | add $16, %r10 | |
387 | jg LABEL(nibble_ashr_1_use) | |
388 | ||
389 | movdqa (%rdi, %rdx), %xmm0 | |
618280a1 | 390 | palignr $1, -16(%rdi, %rdx), D(%xmm0) |
d9a4d2ab UD |
391 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
392 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
393 | #else | |
394 | movdqa (%rsi,%rdx), %xmm1 | |
395 | TOLOWER (%xmm0, %xmm1) | |
396 | pcmpistri $0x1a, %xmm1, %xmm0 | |
397 | #endif | |
398 | jbe LABEL(exit_use) | |
399 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
400 | sub $16, %r11 | |
401 | jbe LABEL(strcmp_exitz) | |
402 | #endif | |
403 | add $16, %rdx | |
404 | jmp LABEL(loop_ashr_1_use) | |
405 | ||
406 | .p2align 4 | |
407 | LABEL(nibble_ashr_1_use): | |
408 | sub $0x1000, %r10 | |
409 | movdqa -16(%rdi, %rdx), %xmm0 | |
618280a1 | 410 | psrldq $1, D(%xmm0) |
d9a4d2ab UD |
411 | pcmpistri $0x3a,%xmm0, %xmm0 |
412 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
413 | cmp %r11, %rcx | |
414 | jae LABEL(nibble_ashr_exit_use) | |
415 | #endif | |
416 | cmp $14, %ecx | |
417 | ja LABEL(nibble_ashr_1_restart_use) | |
418 | ||
419 | jmp LABEL(nibble_ashr_exit_use) | |
420 | ||
421 | /* | |
422 | * The following cases will be handled by ashr_2 | |
423 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case | |
424 | * n(14~15) n -14 1(15 +(n-14) - n) ashr_2 | |
425 | */ | |
426 | .p2align 4 | |
427 | LABEL(ashr_2): | |
618280a1 | 428 | pslldq $14, D(%xmm2) |
d9a4d2ab | 429 | TOLOWER (%xmm1, %xmm2) |
618280a1 UD |
430 | pcmpeqb %xmm1, D(%xmm2) |
431 | psubb %xmm0, D(%xmm2) | |
d9a4d2ab UD |
432 | pmovmskb %xmm2, %r9d |
433 | shr %cl, %edx | |
434 | shr %cl, %r9d | |
435 | sub %r9d, %edx | |
436 | jnz LABEL(less32bytes) | |
437 | movdqa (%rdi), %xmm3 | |
438 | UPDATE_STRNCMP_COUNTER | |
439 | ||
d9a4d2ab UD |
440 | mov $16, %rcx /* index for loads */ |
441 | mov $2, %r9d /* byte position left over from less32bytes case */ | |
442 | /* | |
443 | * Setting up %r10 this way allows us to detect crossing a page boundary. | |
444 | * When %r10 goes positive we have crossed a page boundary and | |
445 | * need to do a nibble. | |
446 | */ | |
447 | lea 2(%rdi), %r10 | |
448 | and $0xfff, %r10 /* offset into 4K page */ | |
449 | sub $0x1000, %r10 /* subtract 4K pagesize */ | |
450 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ | |
451 | ||
452 | .p2align 4 | |
453 | LABEL(loop_ashr_2_use): | |
454 | add $16, %r10 | |
455 | jg LABEL(nibble_ashr_2_use) | |
456 | ||
457 | LABEL(nibble_ashr_2_restart_use): | |
458 | movdqa (%rdi, %rdx), %xmm0 | |
618280a1 | 459 | palignr $2, -16(%rdi, %rdx), D(%xmm0) |
d9a4d2ab UD |
460 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
461 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
462 | #else | |
463 | movdqa (%rsi,%rdx), %xmm1 | |
464 | TOLOWER (%xmm0, %xmm1) | |
465 | pcmpistri $0x1a, %xmm1, %xmm0 | |
466 | #endif | |
467 | jbe LABEL(exit_use) | |
468 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
469 | sub $16, %r11 | |
470 | jbe LABEL(strcmp_exitz) | |
471 | #endif | |
472 | ||
473 | add $16, %rdx | |
474 | add $16, %r10 | |
475 | jg LABEL(nibble_ashr_2_use) | |
476 | ||
477 | movdqa (%rdi, %rdx), %xmm0 | |
618280a1 | 478 | palignr $2, -16(%rdi, %rdx), D(%xmm0) |
d9a4d2ab UD |
479 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
480 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
481 | #else | |
482 | movdqa (%rsi,%rdx), %xmm1 | |
483 | TOLOWER (%xmm0, %xmm1) | |
484 | pcmpistri $0x1a, %xmm1, %xmm0 | |
485 | #endif | |
486 | jbe LABEL(exit_use) | |
487 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
488 | sub $16, %r11 | |
489 | jbe LABEL(strcmp_exitz) | |
490 | #endif | |
491 | add $16, %rdx | |
492 | jmp LABEL(loop_ashr_2_use) | |
493 | ||
494 | .p2align 4 | |
495 | LABEL(nibble_ashr_2_use): | |
496 | sub $0x1000, %r10 | |
497 | movdqa -16(%rdi, %rdx), %xmm0 | |
618280a1 | 498 | psrldq $2, D(%xmm0) |
d9a4d2ab UD |
499 | pcmpistri $0x3a,%xmm0, %xmm0 |
500 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
501 | cmp %r11, %rcx | |
502 | jae LABEL(nibble_ashr_exit_use) | |
503 | #endif | |
504 | cmp $13, %ecx | |
505 | ja LABEL(nibble_ashr_2_restart_use) | |
506 | ||
507 | jmp LABEL(nibble_ashr_exit_use) | |
508 | ||
509 | /* | |
510 | * The following cases will be handled by ashr_3 | |
511 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case | |
512 | * n(13~15) n -13 2(15 +(n-13) - n) ashr_3 | |
513 | */ | |
514 | .p2align 4 | |
515 | LABEL(ashr_3): | |
618280a1 | 516 | pslldq $13, D(%xmm2) |
d9a4d2ab | 517 | TOLOWER (%xmm1, %xmm2) |
618280a1 UD |
518 | pcmpeqb %xmm1, D(%xmm2) |
519 | psubb %xmm0, D(%xmm2) | |
d9a4d2ab UD |
520 | pmovmskb %xmm2, %r9d |
521 | shr %cl, %edx | |
522 | shr %cl, %r9d | |
523 | sub %r9d, %edx | |
524 | jnz LABEL(less32bytes) | |
525 | movdqa (%rdi), %xmm3 | |
526 | ||
527 | UPDATE_STRNCMP_COUNTER | |
528 | ||
d9a4d2ab UD |
529 | mov $16, %rcx /* index for loads */ |
530 | mov $3, %r9d /* byte position left over from less32bytes case */ | |
531 | /* | |
532 | * Setting up %r10 this way allows us to detect crossing a page boundary. | |
533 | * When %r10 goes positive we have crossed a page boundary and | |
534 | * need to do a nibble. | |
535 | */ | |
536 | lea 3(%rdi), %r10 | |
537 | and $0xfff, %r10 /* offset into 4K page */ | |
538 | sub $0x1000, %r10 /* subtract 4K pagesize */ | |
539 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ | |
540 | ||
541 | LABEL(loop_ashr_3_use): | |
542 | add $16, %r10 | |
543 | jg LABEL(nibble_ashr_3_use) | |
544 | ||
545 | LABEL(nibble_ashr_3_restart_use): | |
546 | movdqa (%rdi, %rdx), %xmm0 | |
618280a1 | 547 | palignr $3, -16(%rdi, %rdx), D(%xmm0) |
d9a4d2ab UD |
548 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
549 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
550 | #else | |
551 | movdqa (%rsi,%rdx), %xmm1 | |
552 | TOLOWER (%xmm0, %xmm1) | |
553 | pcmpistri $0x1a, %xmm1, %xmm0 | |
554 | #endif | |
555 | jbe LABEL(exit_use) | |
556 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
557 | sub $16, %r11 | |
558 | jbe LABEL(strcmp_exitz) | |
559 | #endif | |
560 | ||
561 | add $16, %rdx | |
562 | add $16, %r10 | |
563 | jg LABEL(nibble_ashr_3_use) | |
564 | ||
565 | movdqa (%rdi, %rdx), %xmm0 | |
618280a1 | 566 | palignr $3, -16(%rdi, %rdx), D(%xmm0) |
d9a4d2ab UD |
567 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
568 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
569 | #else | |
570 | movdqa (%rsi,%rdx), %xmm1 | |
571 | TOLOWER (%xmm0, %xmm1) | |
572 | pcmpistri $0x1a, %xmm1, %xmm0 | |
573 | #endif | |
574 | jbe LABEL(exit_use) | |
575 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
576 | sub $16, %r11 | |
577 | jbe LABEL(strcmp_exitz) | |
578 | #endif | |
579 | add $16, %rdx | |
580 | jmp LABEL(loop_ashr_3_use) | |
581 | ||
582 | .p2align 4 | |
583 | LABEL(nibble_ashr_3_use): | |
584 | sub $0x1000, %r10 | |
585 | movdqa -16(%rdi, %rdx), %xmm0 | |
618280a1 | 586 | psrldq $3, D(%xmm0) |
d9a4d2ab UD |
587 | pcmpistri $0x3a,%xmm0, %xmm0 |
588 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
589 | cmp %r11, %rcx | |
590 | jae LABEL(nibble_ashr_exit_use) | |
591 | #endif | |
592 | cmp $12, %ecx | |
593 | ja LABEL(nibble_ashr_3_restart_use) | |
594 | ||
595 | jmp LABEL(nibble_ashr_exit_use) | |
596 | ||
597 | /* | |
598 | * The following cases will be handled by ashr_4 | |
599 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case | |
600 | * n(12~15) n -12 3(15 +(n-12) - n) ashr_4 | |
601 | */ | |
602 | .p2align 4 | |
603 | LABEL(ashr_4): | |
618280a1 | 604 | pslldq $12, D(%xmm2) |
d9a4d2ab | 605 | TOLOWER (%xmm1, %xmm2) |
618280a1 UD |
606 | pcmpeqb %xmm1, D(%xmm2) |
607 | psubb %xmm0, D(%xmm2) | |
d9a4d2ab UD |
608 | pmovmskb %xmm2, %r9d |
609 | shr %cl, %edx | |
610 | shr %cl, %r9d | |
611 | sub %r9d, %edx | |
612 | jnz LABEL(less32bytes) | |
613 | movdqa (%rdi), %xmm3 | |
614 | ||
615 | UPDATE_STRNCMP_COUNTER | |
616 | ||
d9a4d2ab UD |
617 | mov $16, %rcx /* index for loads */ |
618 | mov $4, %r9d /* byte position left over from less32bytes case */ | |
619 | /* | |
620 | * Setup %r10 value allows us to detect crossing a page boundary. | |
621 | * When %r10 goes positive we have crossed a page boundary and | |
622 | * need to do a nibble. | |
623 | */ | |
624 | lea 4(%rdi), %r10 | |
625 | and $0xfff, %r10 /* offset into 4K page */ | |
626 | sub $0x1000, %r10 /* subtract 4K pagesize */ | |
627 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ | |
628 | ||
629 | .p2align 4 | |
630 | LABEL(loop_ashr_4_use): | |
631 | add $16, %r10 | |
632 | jg LABEL(nibble_ashr_4_use) | |
633 | ||
634 | LABEL(nibble_ashr_4_restart_use): | |
635 | movdqa (%rdi, %rdx), %xmm0 | |
618280a1 | 636 | palignr $4, -16(%rdi, %rdx), D(%xmm0) |
d9a4d2ab UD |
637 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
638 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
639 | #else | |
640 | movdqa (%rsi,%rdx), %xmm1 | |
641 | TOLOWER (%xmm0, %xmm1) | |
642 | pcmpistri $0x1a, %xmm1, %xmm0 | |
643 | #endif | |
644 | jbe LABEL(exit_use) | |
645 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
646 | sub $16, %r11 | |
647 | jbe LABEL(strcmp_exitz) | |
648 | #endif | |
649 | ||
650 | add $16, %rdx | |
651 | add $16, %r10 | |
652 | jg LABEL(nibble_ashr_4_use) | |
653 | ||
654 | movdqa (%rdi, %rdx), %xmm0 | |
618280a1 | 655 | palignr $4, -16(%rdi, %rdx), D(%xmm0) |
d9a4d2ab UD |
656 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
657 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
658 | #else | |
659 | movdqa (%rsi,%rdx), %xmm1 | |
660 | TOLOWER (%xmm0, %xmm1) | |
661 | pcmpistri $0x1a, %xmm1, %xmm0 | |
662 | #endif | |
663 | jbe LABEL(exit_use) | |
664 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
665 | sub $16, %r11 | |
666 | jbe LABEL(strcmp_exitz) | |
667 | #endif | |
668 | add $16, %rdx | |
669 | jmp LABEL(loop_ashr_4_use) | |
670 | ||
671 | .p2align 4 | |
672 | LABEL(nibble_ashr_4_use): | |
673 | sub $0x1000, %r10 | |
674 | movdqa -16(%rdi, %rdx), %xmm0 | |
618280a1 | 675 | psrldq $4, D(%xmm0) |
d9a4d2ab UD |
676 | pcmpistri $0x3a,%xmm0, %xmm0 |
677 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
678 | cmp %r11, %rcx | |
679 | jae LABEL(nibble_ashr_exit_use) | |
680 | #endif | |
681 | cmp $11, %ecx | |
682 | ja LABEL(nibble_ashr_4_restart_use) | |
683 | ||
684 | jmp LABEL(nibble_ashr_exit_use) | |
685 | ||
686 | /* | |
687 | * The following cases will be handled by ashr_5 | |
688 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case | |
689 | * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5 | |
690 | */ | |
691 | .p2align 4 | |
692 | LABEL(ashr_5): | |
618280a1 | 693 | pslldq $11, D(%xmm2) |
d9a4d2ab | 694 | TOLOWER (%xmm1, %xmm2) |
618280a1 UD |
695 | pcmpeqb %xmm1, D(%xmm2) |
696 | psubb %xmm0, D(%xmm2) | |
d9a4d2ab UD |
697 | pmovmskb %xmm2, %r9d |
698 | shr %cl, %edx | |
699 | shr %cl, %r9d | |
700 | sub %r9d, %edx | |
701 | jnz LABEL(less32bytes) | |
702 | movdqa (%rdi), %xmm3 | |
703 | ||
704 | UPDATE_STRNCMP_COUNTER | |
705 | ||
d9a4d2ab UD |
706 | mov $16, %rcx /* index for loads */ |
707 | mov $5, %r9d /* byte position left over from less32bytes case */ | |
708 | /* | |
709 | * Setup %r10 value allows us to detect crossing a page boundary. | |
710 | * When %r10 goes positive we have crossed a page boundary and | |
711 | * need to do a nibble. | |
712 | */ | |
713 | lea 5(%rdi), %r10 | |
714 | and $0xfff, %r10 /* offset into 4K page */ | |
715 | sub $0x1000, %r10 /* subtract 4K pagesize */ | |
716 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ | |
717 | ||
718 | .p2align 4 | |
719 | LABEL(loop_ashr_5_use): | |
720 | add $16, %r10 | |
721 | jg LABEL(nibble_ashr_5_use) | |
722 | ||
723 | LABEL(nibble_ashr_5_restart_use): | |
724 | movdqa (%rdi, %rdx), %xmm0 | |
618280a1 | 725 | palignr $5, -16(%rdi, %rdx), D(%xmm0) |
d9a4d2ab UD |
726 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
727 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
728 | #else | |
729 | movdqa (%rsi,%rdx), %xmm1 | |
730 | TOLOWER (%xmm0, %xmm1) | |
731 | pcmpistri $0x1a, %xmm1, %xmm0 | |
732 | #endif | |
733 | jbe LABEL(exit_use) | |
734 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
735 | sub $16, %r11 | |
736 | jbe LABEL(strcmp_exitz) | |
737 | #endif | |
738 | ||
739 | add $16, %rdx | |
740 | add $16, %r10 | |
741 | jg LABEL(nibble_ashr_5_use) | |
742 | ||
743 | movdqa (%rdi, %rdx), %xmm0 | |
744 | ||
618280a1 | 745 | palignr $5, -16(%rdi, %rdx), D(%xmm0) |
d9a4d2ab UD |
746 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
747 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
748 | #else | |
749 | movdqa (%rsi,%rdx), %xmm1 | |
750 | TOLOWER (%xmm0, %xmm1) | |
751 | pcmpistri $0x1a, %xmm1, %xmm0 | |
752 | #endif | |
753 | jbe LABEL(exit_use) | |
754 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
755 | sub $16, %r11 | |
756 | jbe LABEL(strcmp_exitz) | |
757 | #endif | |
758 | add $16, %rdx | |
759 | jmp LABEL(loop_ashr_5_use) | |
760 | ||
761 | .p2align 4 | |
762 | LABEL(nibble_ashr_5_use): | |
763 | sub $0x1000, %r10 | |
764 | movdqa -16(%rdi, %rdx), %xmm0 | |
618280a1 | 765 | psrldq $5, D(%xmm0) |
d9a4d2ab UD |
766 | pcmpistri $0x3a,%xmm0, %xmm0 |
767 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
768 | cmp %r11, %rcx | |
769 | jae LABEL(nibble_ashr_exit_use) | |
770 | #endif | |
771 | cmp $10, %ecx | |
772 | ja LABEL(nibble_ashr_5_restart_use) | |
773 | ||
774 | jmp LABEL(nibble_ashr_exit_use) | |
775 | ||
776 | /* | |
777 | * The following cases will be handled by ashr_6 | |
778 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case | |
779 | * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6 | |
780 | */ | |
781 | .p2align 4 | |
782 | LABEL(ashr_6): | |
618280a1 | 783 | pslldq $10, D(%xmm2) |
d9a4d2ab | 784 | TOLOWER (%xmm1, %xmm2) |
618280a1 UD |
785 | pcmpeqb %xmm1, D(%xmm2) |
786 | psubb %xmm0, D(%xmm2) | |
d9a4d2ab UD |
787 | pmovmskb %xmm2, %r9d |
788 | shr %cl, %edx | |
789 | shr %cl, %r9d | |
790 | sub %r9d, %edx | |
791 | jnz LABEL(less32bytes) | |
792 | movdqa (%rdi), %xmm3 | |
793 | ||
794 | UPDATE_STRNCMP_COUNTER | |
795 | ||
d9a4d2ab UD |
796 | mov $16, %rcx /* index for loads */ |
797 | mov $6, %r9d /* byte position left over from less32bytes case */ | |
798 | /* | |
799 | * Setting up %r10 this way allows us to detect crossing a page boundary. | |
800 | * When %r10 goes positive we have crossed a page boundary and | |
801 | * need to do a nibble. | |
802 | */ | |
803 | lea 6(%rdi), %r10 | |
804 | and $0xfff, %r10 /* offset into 4K page */ | |
805 | sub $0x1000, %r10 /* subtract 4K pagesize */ | |
806 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ | |
807 | ||
808 | .p2align 4 | |
809 | LABEL(loop_ashr_6_use): | |
810 | add $16, %r10 | |
811 | jg LABEL(nibble_ashr_6_use) | |
812 | ||
813 | LABEL(nibble_ashr_6_restart_use): | |
814 | movdqa (%rdi, %rdx), %xmm0 | |
618280a1 | 815 | palignr $6, -16(%rdi, %rdx), D(%xmm0) |
d9a4d2ab UD |
816 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
817 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
818 | #else | |
819 | movdqa (%rsi,%rdx), %xmm1 | |
820 | TOLOWER (%xmm0, %xmm1) | |
821 | pcmpistri $0x1a, %xmm1, %xmm0 | |
822 | #endif | |
823 | jbe LABEL(exit_use) | |
824 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
825 | sub $16, %r11 | |
826 | jbe LABEL(strcmp_exitz) | |
827 | #endif | |
828 | ||
829 | add $16, %rdx | |
830 | add $16, %r10 | |
831 | jg LABEL(nibble_ashr_6_use) | |
832 | ||
833 | movdqa (%rdi, %rdx), %xmm0 | |
618280a1 | 834 | palignr $6, -16(%rdi, %rdx), D(%xmm0) |
d9a4d2ab UD |
835 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
836 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
837 | #else | |
838 | movdqa (%rsi,%rdx), %xmm1 | |
839 | TOLOWER (%xmm0, %xmm1) | |
840 | pcmpistri $0x1a, %xmm1, %xmm0 | |
841 | #endif | |
842 | jbe LABEL(exit_use) | |
843 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
844 | sub $16, %r11 | |
845 | jbe LABEL(strcmp_exitz) | |
846 | #endif | |
847 | add $16, %rdx | |
848 | jmp LABEL(loop_ashr_6_use) | |
849 | ||
850 | .p2align 4 | |
851 | LABEL(nibble_ashr_6_use): | |
852 | sub $0x1000, %r10 | |
853 | movdqa -16(%rdi, %rdx), %xmm0 | |
618280a1 | 854 | psrldq $6, D(%xmm0) |
d9a4d2ab UD |
855 | pcmpistri $0x3a,%xmm0, %xmm0 |
856 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
857 | cmp %r11, %rcx | |
858 | jae LABEL(nibble_ashr_exit_use) | |
859 | #endif | |
860 | cmp $9, %ecx | |
861 | ja LABEL(nibble_ashr_6_restart_use) | |
862 | ||
863 | jmp LABEL(nibble_ashr_exit_use) | |
864 | ||
865 | /* | |
866 | * The following cases will be handled by ashr_7 | |
867 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case | |
868 | * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7 | |
869 | */ | |
870 | .p2align 4 | |
871 | LABEL(ashr_7): | |
618280a1 | 872 | pslldq $9, D(%xmm2) /* shift %xmm2 up by 16 - 7 = 9 bytes */ |
d9a4d2ab | 873 | TOLOWER (%xmm1, %xmm2) |
618280a1 UD |
874 | pcmpeqb %xmm1, D(%xmm2) /* 0xff in every byte where the two chunks agree */ |
875 | psubb %xmm0, D(%xmm2) /* NOTE(review): %xmm0 is set before this section, presumably the NUL-byte mask -- confirm */ | |
d9a4d2ab UD |
876 | pmovmskb %xmm2, %r9d /* collect the byte sign bits into %r9d */ |
877 | shr %cl, %edx /* shift both masks right by %cl (header: offset of rsi) */ | |
878 | shr %cl, %r9d | |
879 | sub %r9d, %edx | |
880 | jnz LABEL(less32bytes) /* masks disagree: less32bytes resolves it from the lowest set bit of %edx */ | |
881 | movdqa (%rdi), %xmm3 /* NOTE(review): %xmm3 is never read in the visible code -- confirm it is needed */ | |
882 | ||
883 | UPDATE_STRNCMP_COUNTER /* macro defined outside this section */ | |
884 | ||
d9a4d2ab UD |
885 | mov $16, %rcx /* index for loads */ |
886 | mov $7, %r9d /* byte position left over from less32bytes case; exit_use uses it to re-bias %rdi */ | |
887 | /* | |
888 | * Setup %r10 value allows us to detect crossing a page boundary. | |
889 | * When %r10 goes positive we have crossed a page boundary and | |
890 | * need to do a nibble. | |
891 | */ | |
892 | lea 7(%rdi), %r10 | |
893 | and $0xfff, %r10 /* offset into 4K page */ | |
894 | sub $0x1000, %r10 /* subtract 4K pagesize */ | |
895 | mov %rcx, %rdx /* only for offset of sse4 instruction loop */ | |
896 | ||
897 | .p2align 4 | |
898 | LABEL(loop_ashr_7_use): | |
899 | add $16, %r10 /* one more 16-byte step towards the page end */ | |
900 | jg LABEL(nibble_ashr_7_use) /* %r10 positive: page boundary reached, inspect the tail first */ | |
901 | ||
902 | LABEL(nibble_ashr_7_restart_use): | |
903 | movdqa (%rdi, %rdx), %xmm0 | |
618280a1 | 904 | palignr $7, -16(%rdi, %rdx), D(%xmm0) /* %xmm0 = 16 bytes starting at (%rdi,%rdx) - 16 + 7 */ |
d9a4d2ab UD |
905 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
906 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
907 | #else | |
908 | movdqa (%rsi,%rdx), %xmm1 | |
909 | TOLOWER (%xmm0, %xmm1) | |
910 | pcmpistri $0x1a, %xmm1, %xmm0 | |
911 | #endif | |
912 | jbe LABEL(exit_use) /* CF or ZF set: difference or end of string in this chunk */ | |
913 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
914 | sub $16, %r11 /* 16 bytes of the remaining count used up */ | |
915 | jbe LABEL(strcmp_exitz) /* count exhausted: result is 0 */ | |
916 | #endif | |
917 | ||
918 | add $16, %rdx | |
919 | add $16, %r10 | |
920 | jg LABEL(nibble_ashr_7_use) | |
921 | ||
922 | movdqa (%rdi, %rdx), %xmm0 /* second copy of the compare step (loop is unrolled twice) */ | |
618280a1 | 923 | palignr $7, -16(%rdi, %rdx), D(%xmm0) |
d9a4d2ab UD |
924 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
925 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
926 | #else | |
927 | movdqa (%rsi,%rdx), %xmm1 | |
928 | TOLOWER (%xmm0, %xmm1) | |
929 | pcmpistri $0x1a, %xmm1, %xmm0 | |
930 | #endif | |
931 | jbe LABEL(exit_use) | |
932 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
933 | sub $16, %r11 | |
934 | jbe LABEL(strcmp_exitz) | |
935 | #endif | |
936 | add $16, %rdx | |
937 | jmp LABEL(loop_ashr_7_use) | |
938 | ||
939 | .p2align 4 | |
940 | LABEL(nibble_ashr_7_use): | |
941 | sub $0x1000, %r10 /* re-bias %r10 by one 4K page */ | |
942 | movdqa -16(%rdi, %rdx), %xmm0 /* previous 16-byte chunk of %rdi */ | |
618280a1 | 943 | psrldq $7, D(%xmm0) /* keep its top 9 bytes at the bottom, zero-fill the rest */ |
d9a4d2ab UD |
944 | pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index of the first NUL byte in %xmm0 */ |
945 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
946 | cmp %r11, %rcx | |
947 | jae LABEL(nibble_ashr_exit_use) /* first NUL index >= remaining count in %r11: finish on the tail */ | |
948 | #endif | |
949 | cmp $8, %ecx | |
950 | ja LABEL(nibble_ashr_7_restart_use) /* no NUL in the 9 tail bytes: resume the loop */ | |
951 | ||
952 | jmp LABEL(nibble_ashr_exit_use) /* NUL in the tail: compare it without loading the next chunk */ | |
953 | ||
954 | /* | |
955 | * The following cases will be handled by ashr_8 | |
956 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case | |
957 | * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8 | |
958 | */ | |
959 | .p2align 4 | |
960 | LABEL(ashr_8): | |
618280a1 | 961 | pslldq $8, D(%xmm2) /* shift %xmm2 up by 16 - 8 = 8 bytes */ |
d9a4d2ab | 962 | TOLOWER (%xmm1, %xmm2) |
618280a1 UD |
963 | pcmpeqb %xmm1, D(%xmm2) /* 0xff in every byte where the two chunks agree */ |
964 | psubb %xmm0, D(%xmm2) /* NOTE(review): %xmm0 is set before this section, presumably the NUL-byte mask -- confirm */ | |
d9a4d2ab UD |
965 | pmovmskb %xmm2, %r9d /* collect the byte sign bits into %r9d */ |
966 | shr %cl, %edx /* shift both masks right by %cl (header: offset of rsi) */ | |
967 | shr %cl, %r9d | |
968 | sub %r9d, %edx | |
969 | jnz LABEL(less32bytes) /* masks disagree: less32bytes resolves it from the lowest set bit of %edx */ | |
970 | movdqa (%rdi), %xmm3 /* NOTE(review): %xmm3 is never read in the visible code -- confirm it is needed */ | |
971 | ||
972 | UPDATE_STRNCMP_COUNTER /* macro defined outside this section */ | |
973 | ||
d9a4d2ab UD |
974 | mov $16, %rcx /* index for loads */ |
975 | mov $8, %r9d /* byte position left over from less32bytes case; exit_use uses it to re-bias %rdi */ | |
976 | /* | |
977 | * Setup %r10 value allows us to detect crossing a page boundary. | |
978 | * When %r10 goes positive we have crossed a page boundary and | |
979 | * need to do a nibble. | |
980 | */ | |
981 | lea 8(%rdi), %r10 | |
982 | and $0xfff, %r10 /* offset into 4K page */ | |
983 | sub $0x1000, %r10 /* subtract 4K pagesize */ | |
984 | mov %rcx, %rdx /* only for offset of sse4 instruction loop */ | |
985 | ||
986 | .p2align 4 | |
987 | LABEL(loop_ashr_8_use): | |
988 | add $16, %r10 /* one more 16-byte step towards the page end */ | |
989 | jg LABEL(nibble_ashr_8_use) /* %r10 positive: page boundary reached, inspect the tail first */ | |
990 | ||
991 | LABEL(nibble_ashr_8_restart_use): | |
992 | movdqa (%rdi, %rdx), %xmm0 | |
618280a1 | 993 | palignr $8, -16(%rdi, %rdx), D(%xmm0) /* %xmm0 = 16 bytes starting at (%rdi,%rdx) - 16 + 8 */ |
d9a4d2ab UD |
994 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
995 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 | |
996 | #else | |
997 | movdqa (%rsi,%rdx), %xmm1 | |
998 | TOLOWER (%xmm0, %xmm1) | |
999 | pcmpistri $0x1a, %xmm1, %xmm0 | |
1000 | #endif | |
1001 | jbe LABEL(exit_use) /* CF or ZF set: difference or end of string in this chunk */ | |
1002 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1003 | sub $16, %r11 /* 16 bytes of the remaining count used up */ | |
1004 | jbe LABEL(strcmp_exitz) /* count exhausted: result is 0 */ | |
1005 | #endif | |
1006 | ||
1007 | add $16, %rdx | |
1008 | add $16, %r10 | |
1009 | jg LABEL(nibble_ashr_8_use) | |
1010 | ||
1011 | movdqa (%rdi, %rdx), %xmm0 /* second copy of the compare step (loop is unrolled twice) */ | |
618280a1 | 1012 | palignr $8, -16(%rdi, %rdx), D(%xmm0) |
d9a4d2ab UD |
1013 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1014 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 | |
1015 | #else | |
1016 | movdqa (%rsi,%rdx), %xmm1 | |
1017 | TOLOWER (%xmm0, %xmm1) | |
1018 | pcmpistri $0x1a, %xmm1, %xmm0 | |
1019 | #endif | |
1020 | jbe LABEL(exit_use) | |
1021 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1022 | sub $16, %r11 | |
1023 | jbe LABEL(strcmp_exitz) | |
1024 | #endif | |
1025 | add $16, %rdx | |
1026 | jmp LABEL(loop_ashr_8_use) | |
1027 | ||
1028 | .p2align 4 | |
1029 | LABEL(nibble_ashr_8_use): | |
1030 | sub $0x1000, %r10 /* re-bias %r10 by one 4K page */ | |
1031 | movdqa -16(%rdi, %rdx), %xmm0 /* previous 16-byte chunk of %rdi */ | |
618280a1 | 1032 | psrldq $8, D(%xmm0) /* keep its top 8 bytes at the bottom, zero-fill the rest */ |
d9a4d2ab UD |
1033 | pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index of the first NUL byte in %xmm0 */ |
1034 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1035 | cmp %r11, %rcx | |
1036 | jae LABEL(nibble_ashr_exit_use) /* first NUL index >= remaining count in %r11: finish on the tail */ | |
1037 | #endif | |
1038 | cmp $7, %ecx | |
1039 | ja LABEL(nibble_ashr_8_restart_use) /* no NUL in the 8 tail bytes: resume the loop */ | |
1040 | ||
1041 | jmp LABEL(nibble_ashr_exit_use) /* NUL in the tail: compare it without loading the next chunk */ | |
1042 | ||
1043 | /* | |
1044 | * The following cases will be handled by ashr_9 | |
1045 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case | |
1046 | * n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9 | |
1047 | */ | |
1048 | .p2align 4 | |
1049 | LABEL(ashr_9): | |
618280a1 | 1050 | pslldq $7, D(%xmm2) /* shift %xmm2 up by 16 - 9 = 7 bytes */ |
d9a4d2ab | 1051 | TOLOWER (%xmm1, %xmm2) |
618280a1 UD |
1052 | pcmpeqb %xmm1, D(%xmm2) /* 0xff in every byte where the two chunks agree */ |
1053 | psubb %xmm0, D(%xmm2) /* NOTE(review): %xmm0 is set before this section, presumably the NUL-byte mask -- confirm */ | |
d9a4d2ab UD |
1054 | pmovmskb %xmm2, %r9d /* collect the byte sign bits into %r9d */ |
1055 | shr %cl, %edx /* shift both masks right by %cl (header: offset of rsi) */ | |
1056 | shr %cl, %r9d | |
1057 | sub %r9d, %edx | |
1058 | jnz LABEL(less32bytes) /* masks disagree: less32bytes resolves it from the lowest set bit of %edx */ | |
1059 | movdqa (%rdi), %xmm3 /* NOTE(review): %xmm3 is never read in the visible code -- confirm it is needed */ | |
1060 | ||
1061 | UPDATE_STRNCMP_COUNTER /* macro defined outside this section */ | |
1062 | ||
d9a4d2ab UD |
1063 | mov $16, %rcx /* index for loads */ |
1064 | mov $9, %r9d /* byte position left over from less32bytes case; exit_use uses it to re-bias %rdi */ | |
1065 | /* | |
1066 | * Setup %r10 value allows us to detect crossing a page boundary. | |
1067 | * When %r10 goes positive we have crossed a page boundary and | |
1068 | * need to do a nibble. | |
1069 | */ | |
1070 | lea 9(%rdi), %r10 | |
1071 | and $0xfff, %r10 /* offset into 4K page */ | |
1072 | sub $0x1000, %r10 /* subtract 4K pagesize */ | |
1073 | mov %rcx, %rdx /* only for offset of sse4 instruction loop */ | |
1074 | ||
1075 | .p2align 4 | |
1076 | LABEL(loop_ashr_9_use): | |
1077 | add $16, %r10 /* one more 16-byte step towards the page end */ | |
1078 | jg LABEL(nibble_ashr_9_use) /* %r10 positive: page boundary reached, inspect the tail first */ | |
1079 | ||
1080 | LABEL(nibble_ashr_9_restart_use): | |
1081 | movdqa (%rdi, %rdx), %xmm0 | |
1082 | ||
618280a1 | 1083 | palignr $9, -16(%rdi, %rdx), D(%xmm0) /* %xmm0 = 16 bytes starting at (%rdi,%rdx) - 16 + 9 */ |
d9a4d2ab UD |
1084 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1085 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 | |
1086 | #else | |
1087 | movdqa (%rsi,%rdx), %xmm1 | |
1088 | TOLOWER (%xmm0, %xmm1) | |
1089 | pcmpistri $0x1a, %xmm1, %xmm0 | |
1090 | #endif | |
1091 | jbe LABEL(exit_use) /* CF or ZF set: difference or end of string in this chunk */ | |
1092 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1093 | sub $16, %r11 /* 16 bytes of the remaining count used up */ | |
1094 | jbe LABEL(strcmp_exitz) /* count exhausted: result is 0 */ | |
1095 | #endif | |
1096 | ||
1097 | add $16, %rdx | |
1098 | add $16, %r10 | |
1099 | jg LABEL(nibble_ashr_9_use) | |
1100 | ||
1101 | movdqa (%rdi, %rdx), %xmm0 /* second copy of the compare step (loop is unrolled twice) */ | |
618280a1 | 1102 | palignr $9, -16(%rdi, %rdx), D(%xmm0) |
d9a4d2ab UD |
1103 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1104 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 | |
1105 | #else | |
1106 | movdqa (%rsi,%rdx), %xmm1 | |
1107 | TOLOWER (%xmm0, %xmm1) | |
1108 | pcmpistri $0x1a, %xmm1, %xmm0 | |
1109 | #endif | |
1110 | jbe LABEL(exit_use) | |
1111 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1112 | sub $16, %r11 | |
1113 | jbe LABEL(strcmp_exitz) | |
1114 | #endif | |
1115 | add $16, %rdx | |
1116 | jmp LABEL(loop_ashr_9_use) | |
1117 | ||
1118 | .p2align 4 | |
1119 | LABEL(nibble_ashr_9_use): | |
1120 | sub $0x1000, %r10 /* re-bias %r10 by one 4K page */ | |
1121 | movdqa -16(%rdi, %rdx), %xmm0 /* previous 16-byte chunk of %rdi */ | |
618280a1 | 1122 | psrldq $9, D(%xmm0) /* keep its top 7 bytes at the bottom, zero-fill the rest */ |
d9a4d2ab UD |
1123 | pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index of the first NUL byte in %xmm0 */ |
1124 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1125 | cmp %r11, %rcx | |
1126 | jae LABEL(nibble_ashr_exit_use) /* first NUL index >= remaining count in %r11: finish on the tail */ | |
1127 | #endif | |
1128 | cmp $6, %ecx | |
1129 | ja LABEL(nibble_ashr_9_restart_use) /* no NUL in the 7 tail bytes: resume the loop */ | |
1130 | ||
1131 | jmp LABEL(nibble_ashr_exit_use) /* NUL in the tail: compare it without loading the next chunk */ | |
1132 | ||
1133 | /* | |
1134 | * The following cases will be handled by ashr_10 | |
1135 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case | |
1136 | * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10 | |
1137 | */ | |
1138 | .p2align 4 | |
1139 | LABEL(ashr_10): | |
618280a1 | 1140 | pslldq $6, D(%xmm2) /* shift %xmm2 up by 16 - 10 = 6 bytes */ |
d9a4d2ab | 1141 | TOLOWER (%xmm1, %xmm2) |
618280a1 UD |
1142 | pcmpeqb %xmm1, D(%xmm2) /* 0xff in every byte where the two chunks agree */ |
1143 | psubb %xmm0, D(%xmm2) /* NOTE(review): %xmm0 is set before this section, presumably the NUL-byte mask -- confirm */ | |
d9a4d2ab UD |
1144 | pmovmskb %xmm2, %r9d /* collect the byte sign bits into %r9d */ |
1145 | shr %cl, %edx /* shift both masks right by %cl (header: offset of rsi) */ | |
1146 | shr %cl, %r9d | |
1147 | sub %r9d, %edx | |
1148 | jnz LABEL(less32bytes) /* masks disagree: less32bytes resolves it from the lowest set bit of %edx */ | |
1149 | movdqa (%rdi), %xmm3 /* NOTE(review): %xmm3 is never read in the visible code -- confirm it is needed */ | |
1150 | ||
1151 | UPDATE_STRNCMP_COUNTER /* macro defined outside this section */ | |
1152 | ||
d9a4d2ab UD |
1153 | mov $16, %rcx /* index for loads */ |
1154 | mov $10, %r9d /* byte position left over from less32bytes case; exit_use uses it to re-bias %rdi */ | |
1155 | /* | |
1156 | * Setup %r10 value allows us to detect crossing a page boundary. | |
1157 | * When %r10 goes positive we have crossed a page boundary and | |
1158 | * need to do a nibble. | |
1159 | */ | |
1160 | lea 10(%rdi), %r10 | |
1161 | and $0xfff, %r10 /* offset into 4K page */ | |
1162 | sub $0x1000, %r10 /* subtract 4K pagesize */ | |
1163 | mov %rcx, %rdx /* only for offset of sse4 instruction loop */ | |
1164 | ||
1165 | .p2align 4 | |
1166 | LABEL(loop_ashr_10_use): | |
1167 | add $16, %r10 /* one more 16-byte step towards the page end */ | |
1168 | jg LABEL(nibble_ashr_10_use) /* %r10 positive: page boundary reached, inspect the tail first */ | |
1169 | ||
1170 | LABEL(nibble_ashr_10_restart_use): | |
1171 | movdqa (%rdi, %rdx), %xmm0 | |
618280a1 | 1172 | palignr $10, -16(%rdi, %rdx), D(%xmm0) /* %xmm0 = 16 bytes starting at (%rdi,%rdx) - 16 + 10 */ |
d9a4d2ab UD |
1173 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1174 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 | |
1175 | #else | |
1176 | movdqa (%rsi,%rdx), %xmm1 | |
1177 | TOLOWER (%xmm0, %xmm1) | |
1178 | pcmpistri $0x1a, %xmm1, %xmm0 | |
1179 | #endif | |
1180 | jbe LABEL(exit_use) /* CF or ZF set: difference or end of string in this chunk */ | |
1181 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1182 | sub $16, %r11 /* 16 bytes of the remaining count used up */ | |
1183 | jbe LABEL(strcmp_exitz) /* count exhausted: result is 0 */ | |
1184 | #endif | |
1185 | ||
1186 | add $16, %rdx | |
1187 | add $16, %r10 | |
1188 | jg LABEL(nibble_ashr_10_use) | |
1189 | ||
1190 | movdqa (%rdi, %rdx), %xmm0 /* second copy of the compare step (loop is unrolled twice) */ | |
618280a1 | 1191 | palignr $10, -16(%rdi, %rdx), D(%xmm0) |
d9a4d2ab UD |
1192 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1193 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 | |
1194 | #else | |
1195 | movdqa (%rsi,%rdx), %xmm1 | |
1196 | TOLOWER (%xmm0, %xmm1) | |
1197 | pcmpistri $0x1a, %xmm1, %xmm0 | |
1198 | #endif | |
1199 | jbe LABEL(exit_use) | |
1200 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1201 | sub $16, %r11 | |
1202 | jbe LABEL(strcmp_exitz) | |
1203 | #endif | |
1204 | add $16, %rdx | |
1205 | jmp LABEL(loop_ashr_10_use) | |
1206 | ||
1207 | .p2align 4 | |
1208 | LABEL(nibble_ashr_10_use): | |
1209 | sub $0x1000, %r10 /* re-bias %r10 by one 4K page */ | |
1210 | movdqa -16(%rdi, %rdx), %xmm0 /* previous 16-byte chunk of %rdi */ | |
618280a1 | 1211 | psrldq $10, D(%xmm0) /* keep its top 6 bytes at the bottom, zero-fill the rest */ |
d9a4d2ab UD |
1212 | pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index of the first NUL byte in %xmm0 */ |
1213 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1214 | cmp %r11, %rcx | |
1215 | jae LABEL(nibble_ashr_exit_use) /* first NUL index >= remaining count in %r11: finish on the tail */ | |
1216 | #endif | |
1217 | cmp $5, %ecx | |
1218 | ja LABEL(nibble_ashr_10_restart_use) /* no NUL in the 6 tail bytes: resume the loop */ | |
1219 | ||
1220 | jmp LABEL(nibble_ashr_exit_use) /* NUL in the tail: compare it without loading the next chunk */ | |
1221 | ||
1222 | /* | |
1223 | * The following cases will be handled by ashr_11 | |
1224 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case | |
1225 | * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11 | |
1226 | */ | |
1227 | .p2align 4 | |
1228 | LABEL(ashr_11): | |
618280a1 | 1229 | pslldq $5, D(%xmm2) /* shift %xmm2 up by 16 - 11 = 5 bytes */ |
d9a4d2ab | 1230 | TOLOWER (%xmm1, %xmm2) |
618280a1 UD |
1231 | pcmpeqb %xmm1, D(%xmm2) /* 0xff in every byte where the two chunks agree */ |
1232 | psubb %xmm0, D(%xmm2) /* NOTE(review): %xmm0 is set before this section, presumably the NUL-byte mask -- confirm */ | |
d9a4d2ab UD |
1233 | pmovmskb %xmm2, %r9d /* collect the byte sign bits into %r9d */ |
1234 | shr %cl, %edx /* shift both masks right by %cl (header: offset of rsi) */ | |
1235 | shr %cl, %r9d | |
1236 | sub %r9d, %edx | |
1237 | jnz LABEL(less32bytes) /* masks disagree: less32bytes resolves it from the lowest set bit of %edx */ | |
1238 | movdqa (%rdi), %xmm3 /* NOTE(review): %xmm3 is never read in the visible code -- confirm it is needed */ | |
1239 | ||
1240 | UPDATE_STRNCMP_COUNTER /* macro defined outside this section */ | |
1241 | ||
d9a4d2ab UD |
1242 | mov $16, %rcx /* index for loads */ |
1243 | mov $11, %r9d /* byte position left over from less32bytes case; exit_use uses it to re-bias %rdi */ | |
1244 | /* | |
1245 | * Setup %r10 value allows us to detect crossing a page boundary. | |
1246 | * When %r10 goes positive we have crossed a page boundary and | |
1247 | * need to do a nibble. | |
1248 | */ | |
1249 | lea 11(%rdi), %r10 | |
1250 | and $0xfff, %r10 /* offset into 4K page */ | |
1251 | sub $0x1000, %r10 /* subtract 4K pagesize */ | |
1252 | mov %rcx, %rdx /* only for offset of sse4 instruction loop */ | |
1253 | ||
1254 | .p2align 4 | |
1255 | LABEL(loop_ashr_11_use): | |
1256 | add $16, %r10 /* one more 16-byte step towards the page end */ | |
1257 | jg LABEL(nibble_ashr_11_use) /* %r10 positive: page boundary reached, inspect the tail first */ | |
1258 | ||
1259 | LABEL(nibble_ashr_11_restart_use): | |
1260 | movdqa (%rdi, %rdx), %xmm0 | |
618280a1 | 1261 | palignr $11, -16(%rdi, %rdx), D(%xmm0) /* %xmm0 = 16 bytes starting at (%rdi,%rdx) - 16 + 11 */ |
d9a4d2ab UD |
1262 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1263 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 | |
1264 | #else | |
1265 | movdqa (%rsi,%rdx), %xmm1 | |
1266 | TOLOWER (%xmm0, %xmm1) | |
1267 | pcmpistri $0x1a, %xmm1, %xmm0 | |
1268 | #endif | |
1269 | jbe LABEL(exit_use) /* CF or ZF set: difference or end of string in this chunk */ | |
1270 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1271 | sub $16, %r11 /* 16 bytes of the remaining count used up */ | |
1272 | jbe LABEL(strcmp_exitz) /* count exhausted: result is 0 */ | |
1273 | #endif | |
1274 | ||
1275 | add $16, %rdx | |
1276 | add $16, %r10 | |
1277 | jg LABEL(nibble_ashr_11_use) | |
1278 | ||
1279 | movdqa (%rdi, %rdx), %xmm0 /* second copy of the compare step (loop is unrolled twice) */ | |
618280a1 | 1280 | palignr $11, -16(%rdi, %rdx), D(%xmm0) |
d9a4d2ab UD |
1281 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1282 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 | |
1283 | #else | |
1284 | movdqa (%rsi,%rdx), %xmm1 | |
1285 | TOLOWER (%xmm0, %xmm1) | |
1286 | pcmpistri $0x1a, %xmm1, %xmm0 | |
1287 | #endif | |
1288 | jbe LABEL(exit_use) | |
1289 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1290 | sub $16, %r11 | |
1291 | jbe LABEL(strcmp_exitz) | |
1292 | #endif | |
1293 | add $16, %rdx | |
1294 | jmp LABEL(loop_ashr_11_use) | |
1295 | ||
1296 | .p2align 4 | |
1297 | LABEL(nibble_ashr_11_use): | |
1298 | sub $0x1000, %r10 /* re-bias %r10 by one 4K page */ | |
1299 | movdqa -16(%rdi, %rdx), %xmm0 /* previous 16-byte chunk of %rdi */ | |
618280a1 | 1300 | psrldq $11, D(%xmm0) /* keep its top 5 bytes at the bottom, zero-fill the rest */ |
d9a4d2ab UD |
1301 | pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index of the first NUL byte in %xmm0 */ |
1302 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1303 | cmp %r11, %rcx | |
1304 | jae LABEL(nibble_ashr_exit_use) /* first NUL index >= remaining count in %r11: finish on the tail */ | |
1305 | #endif | |
1306 | cmp $4, %ecx | |
1307 | ja LABEL(nibble_ashr_11_restart_use) /* no NUL in the 5 tail bytes: resume the loop */ | |
1308 | ||
1309 | jmp LABEL(nibble_ashr_exit_use) /* NUL in the tail: compare it without loading the next chunk */ | |
1310 | ||
1311 | /* | |
1312 | * The following cases will be handled by ashr_12 | |
1313 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case | |
1314 | * n(4~15) n - 4 11(15 +(n - 4) - n) ashr_12 | |
1315 | */ | |
1316 | .p2align 4 | |
1317 | LABEL(ashr_12): | |
618280a1 | 1318 | pslldq $4, D(%xmm2) /* shift %xmm2 up by 16 - 12 = 4 bytes */ |
d9a4d2ab | 1319 | TOLOWER (%xmm1, %xmm2) |
618280a1 UD |
1320 | pcmpeqb %xmm1, D(%xmm2) /* 0xff in every byte where the two chunks agree */ |
1321 | psubb %xmm0, D(%xmm2) /* NOTE(review): %xmm0 is set before this section, presumably the NUL-byte mask -- confirm */ | |
d9a4d2ab UD |
1322 | pmovmskb %xmm2, %r9d /* collect the byte sign bits into %r9d */ |
1323 | shr %cl, %edx /* shift both masks right by %cl (header: offset of rsi) */ | |
1324 | shr %cl, %r9d | |
1325 | sub %r9d, %edx | |
1326 | jnz LABEL(less32bytes) /* masks disagree: less32bytes resolves it from the lowest set bit of %edx */ | |
1327 | movdqa (%rdi), %xmm3 /* NOTE(review): %xmm3 is never read in the visible code -- confirm it is needed */ | |
1328 | ||
1329 | UPDATE_STRNCMP_COUNTER /* macro defined outside this section */ | |
1330 | ||
d9a4d2ab UD |
1331 | mov $16, %rcx /* index for loads */ |
1332 | mov $12, %r9d /* byte position left over from less32bytes case; exit_use uses it to re-bias %rdi */ | |
1333 | /* | |
1334 | * Setup %r10 value allows us to detect crossing a page boundary. | |
1335 | * When %r10 goes positive we have crossed a page boundary and | |
1336 | * need to do a nibble. | |
1337 | */ | |
1338 | lea 12(%rdi), %r10 | |
1339 | and $0xfff, %r10 /* offset into 4K page */ | |
1340 | sub $0x1000, %r10 /* subtract 4K pagesize */ | |
1341 | mov %rcx, %rdx /* only for offset of sse4 instruction loop */ | |
1342 | ||
1343 | .p2align 4 | |
1344 | LABEL(loop_ashr_12_use): | |
1345 | add $16, %r10 /* one more 16-byte step towards the page end */ | |
1346 | jg LABEL(nibble_ashr_12_use) /* %r10 positive: page boundary reached, inspect the tail first */ | |
1347 | ||
1348 | LABEL(nibble_ashr_12_restart_use): | |
1349 | movdqa (%rdi, %rdx), %xmm0 | |
618280a1 | 1350 | palignr $12, -16(%rdi, %rdx), D(%xmm0) /* %xmm0 = 16 bytes starting at (%rdi,%rdx) - 16 + 12 */ |
d9a4d2ab UD |
1351 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1352 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 | |
1353 | #else | |
1354 | movdqa (%rsi,%rdx), %xmm1 | |
1355 | TOLOWER (%xmm0, %xmm1) | |
1356 | pcmpistri $0x1a, %xmm1, %xmm0 | |
1357 | #endif | |
1358 | jbe LABEL(exit_use) /* CF or ZF set: difference or end of string in this chunk */ | |
1359 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1360 | sub $16, %r11 /* 16 bytes of the remaining count used up */ | |
1361 | jbe LABEL(strcmp_exitz) /* count exhausted: result is 0 */ | |
1362 | #endif | |
1363 | ||
1364 | add $16, %rdx | |
1365 | add $16, %r10 | |
1366 | jg LABEL(nibble_ashr_12_use) | |
1367 | ||
1368 | movdqa (%rdi, %rdx), %xmm0 /* second copy of the compare step (loop is unrolled twice) */ | |
618280a1 | 1369 | palignr $12, -16(%rdi, %rdx), D(%xmm0) |
d9a4d2ab UD |
1370 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1371 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 | |
1372 | #else | |
1373 | movdqa (%rsi,%rdx), %xmm1 | |
1374 | TOLOWER (%xmm0, %xmm1) | |
1375 | pcmpistri $0x1a, %xmm1, %xmm0 | |
1376 | #endif | |
1377 | jbe LABEL(exit_use) | |
1378 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1379 | sub $16, %r11 | |
1380 | jbe LABEL(strcmp_exitz) | |
1381 | #endif | |
1382 | add $16, %rdx | |
1383 | jmp LABEL(loop_ashr_12_use) | |
1384 | ||
1385 | .p2align 4 | |
1386 | LABEL(nibble_ashr_12_use): | |
1387 | sub $0x1000, %r10 /* re-bias %r10 by one 4K page */ | |
1388 | movdqa -16(%rdi, %rdx), %xmm0 /* previous 16-byte chunk of %rdi */ | |
618280a1 | 1389 | psrldq $12, D(%xmm0) /* keep its top 4 bytes at the bottom, zero-fill the rest */ |
d9a4d2ab UD |
1390 | pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index of the first NUL byte in %xmm0 */ |
1391 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1392 | cmp %r11, %rcx | |
1393 | jae LABEL(nibble_ashr_exit_use) /* first NUL index >= remaining count in %r11: finish on the tail */ | |
1394 | #endif | |
1395 | cmp $3, %ecx | |
1396 | ja LABEL(nibble_ashr_12_restart_use) /* no NUL in the 4 tail bytes: resume the loop */ | |
1397 | ||
1398 | jmp LABEL(nibble_ashr_exit_use) /* NUL in the tail: compare it without loading the next chunk */ | |
1399 | ||
1400 | /* | |
1401 | * The following cases will be handled by ashr_13 | |
1402 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case | |
1403 | * n(3~15) n - 3 12(15 +(n - 3) - n) ashr_13 | |
1404 | */ | |
1405 | .p2align 4 | |
1406 | LABEL(ashr_13): | |
618280a1 | 1407 | pslldq $3, D(%xmm2) /* shift %xmm2 up by 16 - 13 = 3 bytes */ |
d9a4d2ab | 1408 | TOLOWER (%xmm1, %xmm2) |
618280a1 UD |
1409 | pcmpeqb %xmm1, D(%xmm2) /* 0xff in every byte where the two chunks agree */ |
1410 | psubb %xmm0, D(%xmm2) /* NOTE(review): %xmm0 is set before this section, presumably the NUL-byte mask -- confirm */ | |
d9a4d2ab UD |
1411 | pmovmskb %xmm2, %r9d /* collect the byte sign bits into %r9d */ |
1412 | shr %cl, %edx /* shift both masks right by %cl (header: offset of rsi) */ | |
1413 | shr %cl, %r9d | |
1414 | sub %r9d, %edx | |
1415 | jnz LABEL(less32bytes) /* masks disagree: less32bytes resolves it from the lowest set bit of %edx */ | |
1416 | movdqa (%rdi), %xmm3 /* NOTE(review): %xmm3 is never read in the visible code -- confirm it is needed */ | |
1417 | ||
1418 | UPDATE_STRNCMP_COUNTER /* macro defined outside this section */ | |
1419 | ||
d9a4d2ab UD |
1420 | mov $16, %rcx /* index for loads */ |
1421 | mov $13, %r9d /* byte position left over from less32bytes case; exit_use uses it to re-bias %rdi */ | |
1422 | /* | |
1423 | * Setup %r10 value allows us to detect crossing a page boundary. | |
1424 | * When %r10 goes positive we have crossed a page boundary and | |
1425 | * need to do a nibble. | |
1426 | */ | |
1427 | lea 13(%rdi), %r10 | |
1428 | and $0xfff, %r10 /* offset into 4K page */ | |
1429 | sub $0x1000, %r10 /* subtract 4K pagesize */ | |
1430 | ||
1431 | mov %rcx, %rdx /* only for offset of sse4 instruction loop */ | |
1432 | ||
1433 | .p2align 4 | |
1434 | LABEL(loop_ashr_13_use): | |
1435 | add $16, %r10 /* one more 16-byte step towards the page end */ | |
1436 | jg LABEL(nibble_ashr_13_use) /* %r10 positive: page boundary reached, inspect the tail first */ | |
1437 | ||
1438 | LABEL(nibble_ashr_13_restart_use): | |
1439 | movdqa (%rdi, %rdx), %xmm0 | |
618280a1 | 1440 | palignr $13, -16(%rdi, %rdx), D(%xmm0) /* %xmm0 = 16 bytes starting at (%rdi,%rdx) - 16 + 13 */ |
d9a4d2ab UD |
1441 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1442 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 | |
1443 | #else | |
1444 | movdqa (%rsi,%rdx), %xmm1 | |
1445 | TOLOWER (%xmm0, %xmm1) | |
1446 | pcmpistri $0x1a, %xmm1, %xmm0 | |
1447 | #endif | |
1448 | jbe LABEL(exit_use) /* CF or ZF set: difference or end of string in this chunk */ | |
1449 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1450 | sub $16, %r11 /* 16 bytes of the remaining count used up */ | |
1451 | jbe LABEL(strcmp_exitz) /* count exhausted: result is 0 */ | |
1452 | #endif | |
1453 | ||
1454 | add $16, %rdx | |
1455 | add $16, %r10 | |
1456 | jg LABEL(nibble_ashr_13_use) | |
1457 | ||
1458 | movdqa (%rdi, %rdx), %xmm0 /* second copy of the compare step (loop is unrolled twice) */ | |
618280a1 | 1459 | palignr $13, -16(%rdi, %rdx), D(%xmm0) |
d9a4d2ab UD |
1460 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1461 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 | |
1462 | #else | |
1463 | movdqa (%rsi,%rdx), %xmm1 | |
1464 | TOLOWER (%xmm0, %xmm1) | |
1465 | pcmpistri $0x1a, %xmm1, %xmm0 | |
1466 | #endif | |
1467 | jbe LABEL(exit_use) | |
1468 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1469 | sub $16, %r11 | |
1470 | jbe LABEL(strcmp_exitz) | |
1471 | #endif | |
1472 | add $16, %rdx | |
1473 | jmp LABEL(loop_ashr_13_use) | |
1474 | ||
1475 | .p2align 4 | |
1476 | LABEL(nibble_ashr_13_use): | |
1477 | sub $0x1000, %r10 /* re-bias %r10 by one 4K page */ | |
1478 | movdqa -16(%rdi, %rdx), %xmm0 /* previous 16-byte chunk of %rdi */ | |
618280a1 | 1479 | psrldq $13, D(%xmm0) /* keep its top 3 bytes at the bottom, zero-fill the rest */ |
d9a4d2ab UD |
1480 | pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index of the first NUL byte in %xmm0 */ |
1481 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1482 | cmp %r11, %rcx | |
1483 | jae LABEL(nibble_ashr_exit_use) /* first NUL index >= remaining count in %r11: finish on the tail */ | |
1484 | #endif | |
1485 | cmp $2, %ecx | |
1486 | ja LABEL(nibble_ashr_13_restart_use) /* no NUL in the 3 tail bytes: resume the loop */ | |
1487 | ||
1488 | jmp LABEL(nibble_ashr_exit_use) /* NUL in the tail: compare it without loading the next chunk */ | |
1489 | ||
1490 | /* | |
1491 | * The following cases will be handled by ashr_14 | |
1492 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case | |
1493 | * n(2~15) n - 2 13(15 +(n - 2) - n) ashr_14 | |
1494 | */ | |
1495 | .p2align 4 | |
1496 | LABEL(ashr_14): | |
618280a1 | 1497 | pslldq $2, D(%xmm2) /* shift %xmm2 up by 16 - 14 = 2 bytes */ |
d9a4d2ab | 1498 | TOLOWER (%xmm1, %xmm2) |
618280a1 UD |
1499 | pcmpeqb %xmm1, D(%xmm2) /* 0xff in every byte where the two chunks agree */ |
1500 | psubb %xmm0, D(%xmm2) /* NOTE(review): %xmm0 is set before this section, presumably the NUL-byte mask -- confirm */ | |
d9a4d2ab UD |
1501 | pmovmskb %xmm2, %r9d /* collect the byte sign bits into %r9d */ |
1502 | shr %cl, %edx /* shift both masks right by %cl (header: offset of rsi) */ | |
1503 | shr %cl, %r9d | |
1504 | sub %r9d, %edx | |
1505 | jnz LABEL(less32bytes) /* masks disagree: less32bytes resolves it from the lowest set bit of %edx */ | |
1506 | movdqa (%rdi), %xmm3 /* NOTE(review): %xmm3 is never read in the visible code -- confirm it is needed */ | |
1507 | ||
1508 | UPDATE_STRNCMP_COUNTER /* macro defined outside this section */ | |
1509 | ||
d9a4d2ab UD |
1510 | mov $16, %rcx /* index for loads */ |
1511 | mov $14, %r9d /* byte position left over from less32bytes case; exit_use uses it to re-bias %rdi */ | |
1512 | /* | |
1513 | * Setup %r10 value allows us to detect crossing a page boundary. | |
1514 | * When %r10 goes positive we have crossed a page boundary and | |
1515 | * need to do a nibble. | |
1516 | */ | |
1517 | lea 14(%rdi), %r10 | |
1518 | and $0xfff, %r10 /* offset into 4K page */ | |
1519 | sub $0x1000, %r10 /* subtract 4K pagesize */ | |
1520 | ||
1521 | mov %rcx, %rdx /* only for offset of sse4 instruction loop */ | |
1522 | ||
1523 | .p2align 4 | |
1524 | LABEL(loop_ashr_14_use): | |
1525 | add $16, %r10 /* one more 16-byte step towards the page end */ | |
1526 | jg LABEL(nibble_ashr_14_use) /* %r10 positive: page boundary reached, inspect the tail first */ | |
1527 | ||
1528 | LABEL(nibble_ashr_14_restart_use): | |
1529 | movdqa (%rdi, %rdx), %xmm0 | |
618280a1 | 1530 | palignr $14, -16(%rdi, %rdx), D(%xmm0) /* %xmm0 = 16 bytes starting at (%rdi,%rdx) - 16 + 14 */ |
d9a4d2ab UD |
1531 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1532 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 | |
1533 | #else | |
1534 | movdqa (%rsi,%rdx), %xmm1 | |
1535 | TOLOWER (%xmm0, %xmm1) | |
1536 | pcmpistri $0x1a, %xmm1, %xmm0 | |
1537 | #endif | |
1538 | jbe LABEL(exit_use) /* CF or ZF set: difference or end of string in this chunk */ | |
1539 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1540 | sub $16, %r11 /* 16 bytes of the remaining count used up */ | |
1541 | jbe LABEL(strcmp_exitz) /* count exhausted: result is 0 */ | |
1542 | #endif | |
1543 | ||
1544 | add $16, %rdx | |
1545 | add $16, %r10 | |
1546 | jg LABEL(nibble_ashr_14_use) | |
1547 | ||
1548 | movdqa (%rdi, %rdx), %xmm0 /* second copy of the compare step (loop is unrolled twice) */ | |
618280a1 | 1549 | palignr $14, -16(%rdi, %rdx), D(%xmm0) |
d9a4d2ab UD |
1550 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1551 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 | |
1552 | #else | |
1553 | movdqa (%rsi,%rdx), %xmm1 | |
1554 | TOLOWER (%xmm0, %xmm1) | |
1555 | pcmpistri $0x1a, %xmm1, %xmm0 | |
1556 | #endif | |
1557 | jbe LABEL(exit_use) | |
1558 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1559 | sub $16, %r11 | |
1560 | jbe LABEL(strcmp_exitz) | |
1561 | #endif | |
1562 | add $16, %rdx | |
1563 | jmp LABEL(loop_ashr_14_use) | |
1564 | ||
1565 | .p2align 4 | |
1566 | LABEL(nibble_ashr_14_use): | |
1567 | sub $0x1000, %r10 /* re-bias %r10 by one 4K page */ | |
1568 | movdqa -16(%rdi, %rdx), %xmm0 /* previous 16-byte chunk of %rdi */ | |
618280a1 | 1569 | psrldq $14, D(%xmm0) /* keep its top 2 bytes at the bottom, zero-fill the rest */ |
d9a4d2ab UD |
1570 | pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index of the first NUL byte in %xmm0 */ |
1571 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1572 | cmp %r11, %rcx | |
1573 | jae LABEL(nibble_ashr_exit_use) /* first NUL index >= remaining count in %r11: finish on the tail */ | |
1574 | #endif | |
1575 | cmp $1, %ecx | |
1576 | ja LABEL(nibble_ashr_14_restart_use) /* no NUL in the 2 tail bytes: resume the loop */ | |
1577 | ||
1578 | jmp LABEL(nibble_ashr_exit_use) /* NUL in the tail: compare it without loading the next chunk */ | |
1579 | ||
1580 | /* | |
1581 | * The following cases will be handled by ashr_15 | |
1582 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case | |
1583 | * n(1~15) n - 1 14(15 +(n - 1) - n) ashr_15 | |
1584 | */ | |
1585 | .p2align 4 | |
1586 | LABEL(ashr_15): | |
618280a1 | 1587 | pslldq $1, D(%xmm2) /* shift %xmm2 up by 16 - 15 = 1 byte */ |
d9a4d2ab | 1588 | TOLOWER (%xmm1, %xmm2) |
618280a1 UD |
1589 | pcmpeqb %xmm1, D(%xmm2) /* 0xff in every byte where the two chunks agree */ |
1590 | psubb %xmm0, D(%xmm2) /* NOTE(review): %xmm0 is set before this section, presumably the NUL-byte mask -- confirm */ | |
d9a4d2ab UD |
1591 | pmovmskb %xmm2, %r9d /* collect the byte sign bits into %r9d */ |
1592 | shr %cl, %edx /* shift both masks right by %cl (header: offset of rsi) */ | |
1593 | shr %cl, %r9d | |
1594 | sub %r9d, %edx | |
1595 | jnz LABEL(less32bytes) /* masks disagree: less32bytes resolves it from the lowest set bit of %edx */ | |
1596 | ||
1597 | movdqa (%rdi), %xmm3 /* NOTE(review): %xmm3 is never read in the visible code -- confirm it is needed */ | |
1598 | ||
1599 | UPDATE_STRNCMP_COUNTER /* macro defined outside this section */ | |
1600 | ||
d9a4d2ab UD |
1601 | mov $16, %rcx /* index for loads */ |
1602 | mov $15, %r9d /* byte position left over from less32bytes case; exit_use uses it to re-bias %rdi */ | |
1603 | /* | |
1604 | * Setup %r10 value allows us to detect crossing a page boundary. | |
1605 | * When %r10 goes positive we have crossed a page boundary and | |
1606 | * need to do a nibble. | |
1607 | */ | |
1608 | lea 15(%rdi), %r10 | |
1609 | and $0xfff, %r10 /* offset into 4K page */ | |
1610 | ||
1611 | sub $0x1000, %r10 /* subtract 4K pagesize */ | |
1612 | ||
1613 | mov %rcx, %rdx /* only for offset of sse4 instruction loop */ | |
1614 | ||
1615 | .p2align 4 | |
1616 | LABEL(loop_ashr_15_use): | |
1617 | add $16, %r10 /* one more 16-byte step towards the page end */ | |
1618 | jg LABEL(nibble_ashr_15_use) /* %r10 positive: page boundary reached, inspect the tail first */ | |
1619 | ||
1620 | LABEL(nibble_ashr_15_restart_use): | |
1621 | movdqa (%rdi, %rdx), %xmm0 | |
618280a1 | 1622 | palignr $15, -16(%rdi, %rdx), D(%xmm0) /* %xmm0 = 16 bytes starting at (%rdi,%rdx) - 16 + 15 */ |
d9a4d2ab UD |
1623 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1624 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 | |
1625 | #else | |
1626 | movdqa (%rsi,%rdx), %xmm1 | |
1627 | TOLOWER (%xmm0, %xmm1) | |
1628 | pcmpistri $0x1a, %xmm1, %xmm0 | |
1629 | #endif | |
1630 | jbe LABEL(exit_use) /* CF or ZF set: difference or end of string in this chunk */ | |
1631 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1632 | sub $16, %r11 /* 16 bytes of the remaining count used up */ | |
1633 | jbe LABEL(strcmp_exitz) /* count exhausted: result is 0 */ | |
1634 | #endif | |
1635 | ||
1636 | add $16, %rdx | |
1637 | add $16, %r10 | |
1638 | jg LABEL(nibble_ashr_15_use) | |
1639 | ||
1640 | movdqa (%rdi, %rdx), %xmm0 /* second copy of the compare step (loop is unrolled twice) */ | |
618280a1 | 1641 | palignr $15, -16(%rdi, %rdx), D(%xmm0) |
d9a4d2ab UD |
1642 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1643 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 | |
1644 | #else | |
1645 | movdqa (%rsi,%rdx), %xmm1 | |
1646 | TOLOWER (%xmm0, %xmm1) | |
1647 | pcmpistri $0x1a, %xmm1, %xmm0 | |
1648 | #endif | |
1649 | jbe LABEL(exit_use) | |
1650 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1651 | sub $16, %r11 | |
1652 | jbe LABEL(strcmp_exitz) | |
1653 | #endif | |
1654 | add $16, %rdx | |
1655 | jmp LABEL(loop_ashr_15_use) | |
1656 | ||
1657 | .p2align 4 | |
1658 | LABEL(nibble_ashr_15_use): | |
1659 | sub $0x1000, %r10 /* re-bias %r10 by one 4K page */ | |
1660 | movdqa -16(%rdi, %rdx), %xmm0 /* previous 16-byte chunk of %rdi */ | |
618280a1 | 1661 | psrldq $15, D(%xmm0) /* keep its top byte at the bottom, zero-fill the rest */ |
d9a4d2ab UD |
1662 | pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index of the first NUL byte in %xmm0 */ |
1663 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1664 | cmp %r11, %rcx | |
1665 | jae LABEL(nibble_ashr_exit_use) /* first NUL index >= remaining count in %r11: finish on the tail */ | |
1666 | #endif | |
1667 | cmp $0, %ecx | |
1668 | ja LABEL(nibble_ashr_15_restart_use) /* tail byte is not NUL: resume the loop; otherwise fall through to the next label */ | |
1669 | ||
1670 | LABEL(nibble_ashr_exit_use): /* compare the zero-padded tail in %xmm0 with the chunk at (%rsi,%rdx) */ | |
1671 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L | |
1672 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
1673 | #else | |
1674 | movdqa (%rsi,%rdx), %xmm1 | |
1675 | TOLOWER (%xmm0, %xmm1) | |
1676 | pcmpistri $0x1a, %xmm1, %xmm0 | |
1677 | #endif | |
1678 | .p2align 4 | |
1679 | LABEL(exit_use): /* entered with the flags and %ecx of the last pcmpistri $0x1a */ | |
1680 | jnc LABEL(strcmp_exitz) /* CF clear: no differing byte, return 0 */ | |
1681 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1682 | sub %rcx, %r11 | |
1683 | jbe LABEL(strcmp_exitz) /* the difference lies at or beyond the remaining count: return 0 */ | |
1684 | #endif | |
1685 | add %rcx, %rdx /* %rdx = chunk offset + index of the first differing byte */ | |
1686 | lea -16(%rdi, %r9), %rdi /* apply the same -16 + shift bias the palignr loads used (%r9 = shift count) */ | |
1687 | movzbl (%rdi, %rdx), %eax | |
1688 | movzbl (%rsi, %rdx), %edx | |
1689 | test %r8d, %r8d | |
1690 | jz LABEL(ret_use) | |
1691 | xchg %eax, %edx /* %r8d non-zero: swap the two bytes before subtracting */ | |
1692 | LABEL(ret_use): | |
1693 | #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L | |
1694 | leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx /* case-insensitive variants: map both bytes through this table (4-byte entries) */ | |
1695 | movl (%rcx,%rdx,4), %edx | |
1696 | movl (%rcx,%rax,4), %eax | |
1697 | #endif | |
1698 | ||
1699 | sub %edx, %eax /* return value: difference of the two bytes */ | |
1700 | ret | |
1701 | ||
1702 | LABEL(less32bytes): | |
1703 | lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */ | |
1704 | lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */ | |
1705 | test %r8d, %r8d | |
1706 | jz LABEL(ret) | |
1707 | xchg %rsi, %rdi /* recover original order according to flag(%r8d) */ | |
1708 | ||
1709 | .p2align 4 | |
1710 | LABEL(ret): | |
1711 | LABEL(less16bytes): | |
1712 | bsf %rdx, %rdx /* find and store bit index in %rdx; it is used below as a byte offset */ | |
1713 | ||
1714 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1715 | sub %rdx, %r11 | |
1716 | jbe LABEL(strcmp_exitz) /* that offset is at or beyond the remaining count: return 0 */ | |
1717 | #endif | |
1718 | movzbl (%rsi, %rdx), %ecx | |
1719 | movzbl (%rdi, %rdx), %eax | |
1720 | ||
1721 | #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L | |
1722 | leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx /* case-insensitive variants: map both bytes through this table (4-byte entries) */ | |
1723 | movl (%rdx,%rcx,4), %ecx | |
1724 | movl (%rdx,%rax,4), %eax | |
1725 | #endif | |
1726 | ||
1727 | sub %ecx, %eax /* return value: byte from %rdi minus byte from %rsi */ | |
1728 | ret | |
1729 | ||
1730 | LABEL(strcmp_exitz): /* common exit for the "no difference found" paths */ | |
1731 | xor %eax, %eax /* return 0 */ | |
1732 | ret | |
1733 | ||
1734 | .p2align 4 | |
1735 | // XXX Same as the byte load, optional table lookup and subtract that end less16bytes above | |
1736 | LABEL(Byte0): | |
1737 | movzx (%rsi), %ecx /* byte at %rsi, zero-extended */ | |
1738 | movzx (%rdi), %eax /* byte at %rdi, zero-extended */ | |
1739 | ||
1740 | #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L | |
1741 | leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx /* case-insensitive variants: map both bytes through this table (4-byte entries) */ | |
1742 | movl (%rdx,%rcx,4), %ecx | |
1743 | movl (%rdx,%rax,4), %eax | |
1744 | #endif | |
1745 | ||
1746 | sub %ecx, %eax /* return value: byte from %rdi minus byte from %rsi */ | |
1747 | ret | |
1748 | cfi_endproc | |
1749 | .size STRCMP_SSE42, .-STRCMP_SSE42 | |
1750 | ||
1751 | #undef UCLOW_reg | |
1752 | #undef UCHIGH_reg | |
1753 | #undef LCQWORD_reg | |
1754 | #undef TOLOWER | |
1755 | ||
1756 | /* Put all SSE 4.2 functions together. */ | |
1757 | .section .rodata.SECTION,"a",@progbits | |
1758 | .p2align 3 | |
1759 | LABEL(unaligned_table): /* 16 entries, each the 32-bit offset of an ashr_N label from the table start */ | |
1760 | .int LABEL(ashr_1) - LABEL(unaligned_table) /* NOTE(review): the code that indexes this table is outside this section */ | |
1761 | .int LABEL(ashr_2) - LABEL(unaligned_table) | |
1762 | .int LABEL(ashr_3) - LABEL(unaligned_table) | |
1763 | .int LABEL(ashr_4) - LABEL(unaligned_table) | |
1764 | .int LABEL(ashr_5) - LABEL(unaligned_table) | |
1765 | .int LABEL(ashr_6) - LABEL(unaligned_table) | |
1766 | .int LABEL(ashr_7) - LABEL(unaligned_table) | |
1767 | .int LABEL(ashr_8) - LABEL(unaligned_table) | |
1768 | .int LABEL(ashr_9) - LABEL(unaligned_table) | |
1769 | .int LABEL(ashr_10) - LABEL(unaligned_table) | |
1770 | .int LABEL(ashr_11) - LABEL(unaligned_table) | |
1771 | .int LABEL(ashr_12) - LABEL(unaligned_table) | |
1772 | .int LABEL(ashr_13) - LABEL(unaligned_table) | |
1773 | .int LABEL(ashr_14) - LABEL(unaligned_table) | |
1774 | .int LABEL(ashr_15) - LABEL(unaligned_table) | |
1775 | .int LABEL(ashr_0) - LABEL(unaligned_table) /* ashr_0 is the last entry, after ashr_15 */ | |
1776 | ||
1777 | #undef LABEL | |
1778 | #undef GLABEL | |
1779 | #undef SECTION | |
618280a1 UD |
1780 | #undef movdqa |
1781 | #undef movdqu | |
1782 | #undef pmovmskb | |
1783 | #undef pcmpistri | |
1784 | #undef psubb | |
1785 | #undef pcmpeqb | |
1786 | #undef psrldq | |
1787 | #undef pslldq | |
1788 | #undef palignr | |
1789 | #undef pxor | |
1790 | #undef D |