/* Git-blame export header — commit d9a4d2ab; table columns were
   "Commit | Line | Data".  The per-line "N | ... | |" decoration below
   is residue of that export, not part of the original source.  */
1 | /* strcmp with SSE4.2 |
2 | Copyright (C) 2009, 2010, 2011 Free Software Foundation, Inc. | |
3 | Contributed by Intel Corporation. | |
4 | This file is part of the GNU C Library. | |
5 | ||
6 | The GNU C Library is free software; you can redistribute it and/or | |
7 | modify it under the terms of the GNU Lesser General Public | |
8 | License as published by the Free Software Foundation; either | |
9 | version 2.1 of the License, or (at your option) any later version. | |
10 | ||
11 | The GNU C Library is distributed in the hope that it will be useful, | |
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
14 | Lesser General Public License for more details. | |
15 | ||
16 | You should have received a copy of the GNU Lesser General Public | |
17 | License along with the GNU C Library; if not, write to the Free | |
18 | Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA | |
19 | 02110-1301 USA. */ | |
20 | ||
21 | ||
22 | /* We use 0x1a: | |
23 | _SIDD_SBYTE_OPS | |
24 | | _SIDD_CMP_EQUAL_EACH | |
25 | | _SIDD_NEGATIVE_POLARITY | |
26 | | _SIDD_LEAST_SIGNIFICANT | |
27 | on pcmpistri to find out if two 16byte data elements are the same | |
28 | and the offset of the first different byte. There are 4 cases: | |
29 | ||
30 | 1. Both 16byte data elements are valid and identical. | |
31 | 2. Both 16byte data elements have EOS and identical. | |
32 | 3. Both 16byte data elements are valid and they differ at offset X. | |
33 | 4. At least one 16byte data element has EOS at offset X. Two 16byte | |
34 | data elements must differ at or before offset X. | |
35 | ||
36 | Here is the table of ECX, CFlag, ZFlag and SFlag for 4 cases: | |
37 | ||
38 | case ECX CFlag ZFlag SFlag | |
39 | 1 16 0 0 0 | |
40 | 2 16 0 1 1 | |
41 | 3 X 1 0 0 | |
42 | 4 0 <= X 1 0/1 0/1 | |
43 | ||
44 | We exit from the loop for cases 2, 3 and 4 with jbe which branches | |
45 | when either CFlag or ZFlag is 1. If CFlag == 0, we return 0 for | |
46 | case 2. */ | |
47 | ||
48 | /* Put all SSE 4.2 functions together. */ | |
49 | .section .text.SECTION,"ax",@progbits | |
50 | .align 16 | |
51 | .type STRCMP_SSE42, @function | |
52 | #ifdef USE_AS_STRCASECMP_L | |
/* strcasecmp entry stub: fetch this thread's current locale -- TLS
   variable __libc_tsd_LOCALE, reached through its GOT TLS-offset slot
   and the %fs segment base -- into %rdx, the register in which the
   strcasecmp_l code below expects its locale_t argument (it is
   dereferenced there via LOCALE_T___LOCALES+LC_CTYPE*8(%rdx)).
   Execution then falls through into the strcasecmp_l entry point.  */
53 | ENTRY (GLABEL(__strcasecmp)) | |
54 | movq __libc_tsd_LOCALE@gottpoff(%rip),%rax | |
55 | movq %fs:(%rax),%rdx | |
56 | ||
57 | // XXX 5 byte should be before the function | |
58 | /* 5-byte NOP. */ | |
/* Alignment padding so the fall-through target starts at its expected
   boundary; the XXX note above records that the padding ideally
   belongs before the function rather than inside it.  */
59 | .byte 0x0f,0x1f,0x44,0x00,0x00 | |
60 | END (GLABEL(__strcasecmp)) | |
61 | /* FALLTHROUGH to strcasecmp_l. */ | |
62 | #endif | |
63 | #ifdef USE_AS_STRNCASECMP_L | |
/* strncasecmp entry stub: fetch this thread's current locale (TLS
   variable __libc_tsd_LOCALE via %fs) into %rcx, the register in which
   the strncasecmp_l code below expects its locale_t argument (it is
   dereferenced there via LOCALE_T___LOCALES+LC_CTYPE*8(%rcx)).
   Execution then falls through into the strncasecmp_l entry point.  */
64 | ENTRY (GLABEL(__strncasecmp)) | |
65 | movq __libc_tsd_LOCALE@gottpoff(%rip),%rax | |
66 | movq %fs:(%rax),%rcx | |
67 | ||
68 | // XXX 5 byte should be before the function | |
69 | /* 5-byte NOP. */ | |
/* Alignment padding for the fall-through target; see the XXX note.  */
70 | .byte 0x0f,0x1f,0x44,0x00,0x00 | |
71 | END (GLABEL(__strncasecmp)) | |
72 | /* FALLTHROUGH to strncasecmp_l. */ | |
73 | #endif | |
74 | ||
75 | STRCMP_SSE42: | |
76 | cfi_startproc | |
77 | CALL_MCOUNT | |
78 | ||
79 | /* | |
80 | * This implementation uses SSE to compare up to 16 bytes at a time. | |
81 | */ | |
82 | #ifdef USE_AS_STRCASECMP_L | |
83 | /* We have to fall back on the C implementation for locales | |
84 | with encodings not matching ASCII for single bytes. */ | |
85 | # if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 | |
86 | movq LOCALE_T___LOCALES+LC_CTYPE*8(%rdx), %rax | |
87 | # else | |
88 | movq (%rdx), %rax | |
89 | # endif | |
90 | testl $0, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax) | |
91 | jne __strcasecmp_l_nonascii | |
92 | #endif | |
93 | #ifdef USE_AS_STRNCASECMP_L | |
94 | /* We have to fall back on the C implementation for locales | |
95 | with encodings not matching ASCII for single bytes. */ | |
96 | # if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 | |
97 | movq LOCALE_T___LOCALES+LC_CTYPE*8(%rcx), %rax | |
98 | # else | |
99 | movq (%rcx), %rax | |
100 | # endif | |
101 | testl $0, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax) | |
102 | jne __strncasecmp_l_nonascii | |
103 | #endif | |
104 | ||
105 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
106 | test %rdx, %rdx | |
107 | je LABEL(strcmp_exitz) | |
108 | cmp $1, %rdx | |
109 | je LABEL(Byte0) | |
110 | mov %rdx, %r11 | |
111 | #endif | |
112 | mov %esi, %ecx | |
113 | mov %edi, %eax | |
114 | /* Use 64bit AND here to avoid long NOP padding. */ | |
115 | and $0x3f, %rcx /* rsi alignment in cache line */ | |
116 | and $0x3f, %rax /* rdi alignment in cache line */ | |
117 | #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L | |
/* Constant tables for the TOLOWER macro (defined further below).
   TOLOWER ORs 0x20 into every byte that lies in the ASCII range
   'A'..'Z', turning uppercase into lowercase in place.  */
118 | .section .rodata.cst16,"aM",@progbits,16 | |
119 | .align 16 | |
/* 0x40 = '@', one below 'A'.  A byte is a candidate when it compares
   strictly greater than this (signed pcmpgtb/vpcmpgtb compare).  */
120 | LABEL(belowupper): | |
121 | .quad 0x4040404040404040 | |
122 | .quad 0x4040404040404040 | |
/* Upper bound of the uppercase range.  The AVX variant tests
   "byte > 0x5a ('Z')" and negates the result (vpandn), so it uses the
   inclusive bound 0x5a; the SSE variant tests "0x5b ('[') > byte"
   directly, so it uses the exclusive bound 0x5b.  Both select exactly
   'A'..'Z'.  */
123 | LABEL(topupper): | |
124 | # ifdef USE_AVX | |
125 | .quad 0x5a5a5a5a5a5a5a5a | |
126 | .quad 0x5a5a5a5a5a5a5a5a | |
127 | # else | |
128 | .quad 0x5b5b5b5b5b5b5b5b | |
129 | .quad 0x5b5b5b5b5b5b5b5b | |
130 | # endif | |
/* 0x20 is the ASCII case bit; ORing it into an uppercase letter
   yields the corresponding lowercase letter.  */
131 | LABEL(touppermask): | |
132 | .quad 0x2020202020202020 | |
133 | .quad 0x2020202020202020 | |
134 | .previous | |
/* Preload the three constants into registers that TOLOWER treats as
   read-only inputs; %xmm4-%xmm6 stay live across the whole compare
   loop in the case-insensitive builds.  */
135 | movdqa LABEL(belowupper)(%rip), %xmm4 | |
136 | # define UCLOW_reg %xmm4 | |
137 | movdqa LABEL(topupper)(%rip), %xmm5 | |
138 | # define UCHIGH_reg %xmm5 | |
139 | movdqa LABEL(touppermask)(%rip), %xmm6 | |
140 | # define LCQWORD_reg %xmm6 | |
141 | #endif | |
142 | cmp $0x30, %ecx | |
143 | ja LABEL(crosscache)/* rsi: 16-byte load will cross cache line */ | |
144 | cmp $0x30, %eax | |
145 | ja LABEL(crosscache)/* rdi: 16-byte load will cross cache line */ | |
146 | movdqu (%rdi), %xmm1 | |
147 | movdqu (%rsi), %xmm2 | |
148 | #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L | |
149 | # ifdef USE_AVX | |
/* TOLOWER(reg1, reg2): in-place ASCII lowercasing of both operands.
   For each byte b: if 'A' <= b <= 'Z', replace it with b | 0x20;
   all other bytes are unchanged.  pcmpgtb/vpcmpgtb is a SIGNED byte
   compare, so bytes >= 0x80 compare negative and never match the
   range -- they pass through untouched.  Clobbers %xmm7-%xmm10;
   reads UCLOW_reg/UCHIGH_reg/LCQWORD_reg (%xmm4-%xmm6) set up above.
   AVX variant: xmm8 = (reg1 > '@') & ~(reg1 > 'Z') is the in-range
   mask (note the inclusive 0x5a bound), masked to 0x20 and ORed in;
   same for reg2 via xmm10.  Three-operand forms need no copies.  */
150 | # define TOLOWER(reg1, reg2) \ | |
151 | vpcmpgtb UCLOW_reg, reg1, %xmm7; \ | |
152 | vpcmpgtb UCHIGH_reg, reg1, %xmm8; \ | |
153 | vpcmpgtb UCLOW_reg, reg2, %xmm9; \ | |
154 | vpcmpgtb UCHIGH_reg, reg2, %xmm10; \ | |
155 | vpandn %xmm7, %xmm8, %xmm8; \ | |
156 | vpandn %xmm9, %xmm10, %xmm10; \ | |
157 | vpand LCQWORD_reg, %xmm8, %xmm8; \ | |
158 | vpand LCQWORD_reg, %xmm10, %xmm10; \ | |
159 | vpor reg1, %xmm8, reg1; \ | |
160 | vpor reg2, %xmm10, reg2 | |
161 | # else | |
/* SSE2 variant of the same transform.  Two-operand pcmpgtb destroys
   its destination, so each operand is first copied: xmm7 = reg1,
   xmm9 = reg2, and UCHIGH is copied to xmm8/xmm10 so it can serve as
   the destination of the upper-bound compare ("0x5b > byte", hence
   the exclusive bound).  The two half-masks are ANDed together, masked
   to 0x20, and ORed back into reg1/reg2.  */
162 | # define TOLOWER(reg1, reg2) \ | |
163 | movdqa reg1, %xmm7; \ | |
164 | movdqa UCHIGH_reg, %xmm8; \ | |
165 | movdqa reg2, %xmm9; \ | |
166 | movdqa UCHIGH_reg, %xmm10; \ | |
167 | pcmpgtb UCLOW_reg, %xmm7; \ | |
168 | pcmpgtb reg1, %xmm8; \ | |
169 | pcmpgtb UCLOW_reg, %xmm9; \ | |
170 | pcmpgtb reg2, %xmm10; \ | |
171 | pand %xmm8, %xmm7; \ | |
172 | pand %xmm10, %xmm9; \ | |
173 | pand LCQWORD_reg, %xmm7; \ | |
174 | pand LCQWORD_reg, %xmm9; \ | |
175 | por %xmm7, reg1; \ | |
176 | por %xmm9, reg2 | |
177 | # endif | |
/* Lowercase the first 16 bytes of both strings loaded just above.  */
178 | TOLOWER (%xmm1, %xmm2) | |
179 | #else | |
/* Case-sensitive builds: TOLOWER expands to nothing.  */
180 | # define TOLOWER(reg1, reg2) | |
181 | #endif | |
182 | pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */ | |
183 | pcmpeqb %xmm1, %xmm0 /* Any null chars? */ | |
184 | pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */ | |
185 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ | |
186 | pmovmskb %xmm1, %edx | |
187 | sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */ | |
188 | jnz LABEL(less16bytes)/* If not, find different value or null char */ | |
189 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
190 | sub $16, %r11 | |
191 | jbe LABEL(strcmp_exitz)/* finish comparision */ | |
192 | #endif | |
193 | add $16, %rsi /* prepare to search next 16 bytes */ | |
194 | add $16, %rdi /* prepare to search next 16 bytes */ | |
195 | ||
196 | /* | |
197 | * Determine source and destination string offsets from 16-byte | |
198 | * alignment. Use relative offset difference between the two to | |
199 | * determine which case below to use. | |
200 | */ | |
201 | .p2align 4 | |
202 | LABEL(crosscache): | |
203 | and $0xfffffffffffffff0, %rsi /* force %rsi is 16 byte aligned */ | |
204 | and $0xfffffffffffffff0, %rdi /* force %rdi is 16 byte aligned */ | |
205 | mov $0xffff, %edx /* for equivalent offset */ | |
206 | xor %r8d, %r8d | |
207 | and $0xf, %ecx /* offset of rsi */ | |
208 | and $0xf, %eax /* offset of rdi */ | |
209 | cmp %eax, %ecx | |
210 | je LABEL(ashr_0) /* rsi and rdi relative offset same */ | |
211 | ja LABEL(bigger) | |
212 | mov %edx, %r8d /* r8d is offset flag for exit tail */ | |
213 | xchg %ecx, %eax | |
214 | xchg %rsi, %rdi | |
215 | LABEL(bigger): | |
216 | lea 15(%rax), %r9 | |
217 | sub %rcx, %r9 | |
218 | lea LABEL(unaligned_table)(%rip), %r10 | |
219 | movslq (%r10, %r9,4), %r9 | |
220 | lea (%r10, %r9), %r10 | |
221 | jmp *%r10 /* jump to corresponding case */ | |
222 | ||
223 | /* | |
224 | * The following cases will be handled by ashr_0 | |
225 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case | |
226 | * n(0~15) n(0~15) 15(15+ n-n) ashr_0 | |
227 | */ | |
228 | .p2align 4 | |
229 | LABEL(ashr_0): | |
230 | ||
231 | movdqa (%rsi), %xmm1 | |
232 | pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */ | |
233 | pcmpeqb %xmm1, %xmm0 /* Any null chars? */ | |
234 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L | |
235 | pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */ | |
236 | #else | |
237 | movdqa (%rdi), %xmm2 | |
238 | TOLOWER (%xmm1, %xmm2) | |
239 | pcmpeqb %xmm2, %xmm1 /* compare 16 bytes for equality */ | |
240 | #endif | |
241 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ | |
242 | pmovmskb %xmm1, %r9d | |
243 | shr %cl, %edx /* adjust 0xffff for offset */ | |
244 | shr %cl, %r9d /* adjust for 16-byte offset */ | |
245 | sub %r9d, %edx | |
246 | /* | |
247 | * edx must be the same with r9d if in left byte (16-rcx) is equal to | |
248 | * the start from (16-rax) and no null char was seen. | |
249 | */ | |
250 | jne LABEL(less32bytes) /* mismatch or null char */ | |
251 | UPDATE_STRNCMP_COUNTER | |
252 | mov $16, %rcx | |
253 | mov $16, %r9 | |
254 | pxor %xmm0, %xmm0 /* clear xmm0, may have changed above */ | |
255 | ||
256 | /* | |
257 | * Now both strings are aligned at 16-byte boundary. Loop over strings | |
258 | * checking 32-bytes per iteration. | |
259 | */ | |
260 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ | |
261 | .p2align 4 | |
262 | LABEL(ashr_0_use): | |
263 | movdqa (%rdi,%rdx), %xmm0 | |
264 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L | |
265 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
266 | #else | |
267 | movdqa (%rsi,%rdx), %xmm1 | |
268 | TOLOWER (%xmm0, %xmm1) | |
269 | pcmpistri $0x1a, %xmm1, %xmm0 | |
270 | #endif | |
271 | lea 16(%rdx), %rdx | |
272 | jbe LABEL(ashr_0_exit_use) | |
273 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
274 | sub $16, %r11 | |
275 | jbe LABEL(strcmp_exitz) | |
276 | #endif | |
277 | ||
278 | movdqa (%rdi,%rdx), %xmm0 | |
279 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L | |
280 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
281 | #else | |
282 | movdqa (%rsi,%rdx), %xmm1 | |
283 | TOLOWER (%xmm0, %xmm1) | |
284 | pcmpistri $0x1a, %xmm1, %xmm0 | |
285 | #endif | |
286 | lea 16(%rdx), %rdx | |
287 | jbe LABEL(ashr_0_exit_use) | |
288 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
289 | sub $16, %r11 | |
290 | jbe LABEL(strcmp_exitz) | |
291 | #endif | |
292 | jmp LABEL(ashr_0_use) | |
293 | ||
294 | ||
295 | .p2align 4 | |
296 | LABEL(ashr_0_exit_use): | |
297 | jnc LABEL(strcmp_exitz) | |
298 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
299 | sub %rcx, %r11 | |
300 | jbe LABEL(strcmp_exitz) | |
301 | #endif | |
302 | lea -16(%rdx, %rcx), %rcx | |
303 | movzbl (%rdi, %rcx), %eax | |
304 | movzbl (%rsi, %rcx), %edx | |
305 | #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L | |
306 | leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx | |
307 | movl (%rcx,%rax,4), %eax | |
308 | movl (%rcx,%rdx,4), %edx | |
309 | #endif | |
310 | sub %edx, %eax | |
311 | ret | |
312 | ||
313 | ||
314 | ||
315 | /* | |
316 | * The following cases will be handled by ashr_1 | |
317 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case | |
318 | * n(15) n -15 0(15 +(n-15) - n) ashr_1 | |
319 | */ | |
320 | .p2align 4 | |
321 | LABEL(ashr_1): | |
322 | pxor %xmm0, %xmm0 | |
323 | movdqa (%rdi), %xmm2 | |
324 | movdqa (%rsi), %xmm1 | |
325 | pcmpeqb %xmm1, %xmm0 /* Any null chars? */ | |
326 | pslldq $15, %xmm2 /* shift first string to align with second */ | |
327 | TOLOWER (%xmm1, %xmm2) | |
328 | pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */ | |
329 | psubb %xmm0, %xmm2 /* packed sub of comparison results*/ | |
330 | pmovmskb %xmm2, %r9d | |
331 | shr %cl, %edx /* adjust 0xffff for offset */ | |
332 | shr %cl, %r9d /* adjust for 16-byte offset */ | |
333 | sub %r9d, %edx | |
334 | jnz LABEL(less32bytes) /* mismatch or null char seen */ | |
335 | movdqa (%rdi), %xmm3 | |
336 | UPDATE_STRNCMP_COUNTER | |
337 | ||
338 | pxor %xmm0, %xmm0 | |
339 | mov $16, %rcx /* index for loads*/ | |
340 | mov $1, %r9d /* byte position left over from less32bytes case */ | |
341 | /* | |
342 | * Setup %r10 value allows us to detect crossing a page boundary. | |
343 | * When %r10 goes positive we have crossed a page boundary and | |
344 | * need to do a nibble. | |
345 | */ | |
346 | lea 1(%rdi), %r10 | |
347 | and $0xfff, %r10 /* offset into 4K page */ | |
348 | sub $0x1000, %r10 /* subtract 4K pagesize */ | |
349 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ | |
350 | ||
351 | .p2align 4 | |
352 | LABEL(loop_ashr_1_use): | |
353 | add $16, %r10 | |
354 | jg LABEL(nibble_ashr_1_use) | |
355 | ||
356 | LABEL(nibble_ashr_1_restart_use): | |
357 | movdqa (%rdi, %rdx), %xmm0 | |
358 | palignr $1, -16(%rdi, %rdx), %xmm0 | |
359 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L | |
360 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
361 | #else | |
362 | movdqa (%rsi,%rdx), %xmm1 | |
363 | TOLOWER (%xmm0, %xmm1) | |
364 | pcmpistri $0x1a, %xmm1, %xmm0 | |
365 | #endif | |
366 | jbe LABEL(exit_use) | |
367 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
368 | sub $16, %r11 | |
369 | jbe LABEL(strcmp_exitz) | |
370 | #endif | |
371 | ||
372 | add $16, %rdx | |
373 | add $16, %r10 | |
374 | jg LABEL(nibble_ashr_1_use) | |
375 | ||
376 | movdqa (%rdi, %rdx), %xmm0 | |
377 | palignr $1, -16(%rdi, %rdx), %xmm0 | |
378 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L | |
379 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
380 | #else | |
381 | movdqa (%rsi,%rdx), %xmm1 | |
382 | TOLOWER (%xmm0, %xmm1) | |
383 | pcmpistri $0x1a, %xmm1, %xmm0 | |
384 | #endif | |
385 | jbe LABEL(exit_use) | |
386 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
387 | sub $16, %r11 | |
388 | jbe LABEL(strcmp_exitz) | |
389 | #endif | |
390 | add $16, %rdx | |
391 | jmp LABEL(loop_ashr_1_use) | |
392 | ||
393 | .p2align 4 | |
394 | LABEL(nibble_ashr_1_use): | |
395 | sub $0x1000, %r10 | |
396 | movdqa -16(%rdi, %rdx), %xmm0 | |
397 | psrldq $1, %xmm0 | |
398 | pcmpistri $0x3a,%xmm0, %xmm0 | |
399 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
400 | cmp %r11, %rcx | |
401 | jae LABEL(nibble_ashr_exit_use) | |
402 | #endif | |
403 | cmp $14, %ecx | |
404 | ja LABEL(nibble_ashr_1_restart_use) | |
405 | ||
406 | jmp LABEL(nibble_ashr_exit_use) | |
407 | ||
408 | /* | |
409 | * The following cases will be handled by ashr_2 | |
410 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case | |
411 | * n(14~15) n -14 1(15 +(n-14) - n) ashr_2 | |
412 | */ | |
413 | .p2align 4 | |
414 | LABEL(ashr_2): | |
415 | pxor %xmm0, %xmm0 | |
416 | movdqa (%rdi), %xmm2 | |
417 | movdqa (%rsi), %xmm1 | |
418 | pcmpeqb %xmm1, %xmm0 | |
419 | pslldq $14, %xmm2 | |
420 | TOLOWER (%xmm1, %xmm2) | |
421 | pcmpeqb %xmm1, %xmm2 | |
422 | psubb %xmm0, %xmm2 | |
423 | pmovmskb %xmm2, %r9d | |
424 | shr %cl, %edx | |
425 | shr %cl, %r9d | |
426 | sub %r9d, %edx | |
427 | jnz LABEL(less32bytes) | |
428 | movdqa (%rdi), %xmm3 | |
429 | UPDATE_STRNCMP_COUNTER | |
430 | ||
431 | pxor %xmm0, %xmm0 | |
432 | mov $16, %rcx /* index for loads */ | |
433 | mov $2, %r9d /* byte position left over from less32bytes case */ | |
434 | /* | |
435 | * Setup %r10 value allows us to detect crossing a page boundary. | |
436 | * When %r10 goes positive we have crossed a page boundary and | |
437 | * need to do a nibble. | |
438 | */ | |
439 | lea 2(%rdi), %r10 | |
440 | and $0xfff, %r10 /* offset into 4K page */ | |
441 | sub $0x1000, %r10 /* subtract 4K pagesize */ | |
442 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ | |
443 | ||
444 | .p2align 4 | |
445 | LABEL(loop_ashr_2_use): | |
446 | add $16, %r10 | |
447 | jg LABEL(nibble_ashr_2_use) | |
448 | ||
449 | LABEL(nibble_ashr_2_restart_use): | |
450 | movdqa (%rdi, %rdx), %xmm0 | |
451 | palignr $2, -16(%rdi, %rdx), %xmm0 | |
452 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L | |
453 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
454 | #else | |
455 | movdqa (%rsi,%rdx), %xmm1 | |
456 | TOLOWER (%xmm0, %xmm1) | |
457 | pcmpistri $0x1a, %xmm1, %xmm0 | |
458 | #endif | |
459 | jbe LABEL(exit_use) | |
460 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
461 | sub $16, %r11 | |
462 | jbe LABEL(strcmp_exitz) | |
463 | #endif | |
464 | ||
465 | add $16, %rdx | |
466 | add $16, %r10 | |
467 | jg LABEL(nibble_ashr_2_use) | |
468 | ||
469 | movdqa (%rdi, %rdx), %xmm0 | |
470 | palignr $2, -16(%rdi, %rdx), %xmm0 | |
471 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L | |
472 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
473 | #else | |
474 | movdqa (%rsi,%rdx), %xmm1 | |
475 | TOLOWER (%xmm0, %xmm1) | |
476 | pcmpistri $0x1a, %xmm1, %xmm0 | |
477 | #endif | |
478 | jbe LABEL(exit_use) | |
479 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
480 | sub $16, %r11 | |
481 | jbe LABEL(strcmp_exitz) | |
482 | #endif | |
483 | add $16, %rdx | |
484 | jmp LABEL(loop_ashr_2_use) | |
485 | ||
486 | .p2align 4 | |
487 | LABEL(nibble_ashr_2_use): | |
488 | sub $0x1000, %r10 | |
489 | movdqa -16(%rdi, %rdx), %xmm0 | |
490 | psrldq $2, %xmm0 | |
491 | pcmpistri $0x3a,%xmm0, %xmm0 | |
492 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
493 | cmp %r11, %rcx | |
494 | jae LABEL(nibble_ashr_exit_use) | |
495 | #endif | |
496 | cmp $13, %ecx | |
497 | ja LABEL(nibble_ashr_2_restart_use) | |
498 | ||
499 | jmp LABEL(nibble_ashr_exit_use) | |
500 | ||
501 | /* | |
502 | * The following cases will be handled by ashr_3 | |
503 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case | |
504 | * n(13~15) n -13 2(15 +(n-13) - n) ashr_3 | |
505 | */ | |
506 | .p2align 4 | |
507 | LABEL(ashr_3): | |
508 | pxor %xmm0, %xmm0 | |
509 | movdqa (%rdi), %xmm2 | |
510 | movdqa (%rsi), %xmm1 | |
511 | pcmpeqb %xmm1, %xmm0 | |
512 | pslldq $13, %xmm2 | |
513 | TOLOWER (%xmm1, %xmm2) | |
514 | pcmpeqb %xmm1, %xmm2 | |
515 | psubb %xmm0, %xmm2 | |
516 | pmovmskb %xmm2, %r9d | |
517 | shr %cl, %edx | |
518 | shr %cl, %r9d | |
519 | sub %r9d, %edx | |
520 | jnz LABEL(less32bytes) | |
521 | movdqa (%rdi), %xmm3 | |
522 | ||
523 | UPDATE_STRNCMP_COUNTER | |
524 | ||
525 | pxor %xmm0, %xmm0 | |
526 | mov $16, %rcx /* index for loads */ | |
527 | mov $3, %r9d /* byte position left over from less32bytes case */ | |
528 | /* | |
529 | * Setup %r10 value allows us to detect crossing a page boundary. | |
530 | * When %r10 goes positive we have crossed a page boundary and | |
531 | * need to do a nibble. | |
532 | */ | |
533 | lea 3(%rdi), %r10 | |
534 | and $0xfff, %r10 /* offset into 4K page */ | |
535 | sub $0x1000, %r10 /* subtract 4K pagesize */ | |
536 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ | |
537 | ||
538 | LABEL(loop_ashr_3_use): | |
539 | add $16, %r10 | |
540 | jg LABEL(nibble_ashr_3_use) | |
541 | ||
542 | LABEL(nibble_ashr_3_restart_use): | |
543 | movdqa (%rdi, %rdx), %xmm0 | |
544 | palignr $3, -16(%rdi, %rdx), %xmm0 | |
545 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L | |
546 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
547 | #else | |
548 | movdqa (%rsi,%rdx), %xmm1 | |
549 | TOLOWER (%xmm0, %xmm1) | |
550 | pcmpistri $0x1a, %xmm1, %xmm0 | |
551 | #endif | |
552 | jbe LABEL(exit_use) | |
553 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
554 | sub $16, %r11 | |
555 | jbe LABEL(strcmp_exitz) | |
556 | #endif | |
557 | ||
558 | add $16, %rdx | |
559 | add $16, %r10 | |
560 | jg LABEL(nibble_ashr_3_use) | |
561 | ||
562 | movdqa (%rdi, %rdx), %xmm0 | |
563 | palignr $3, -16(%rdi, %rdx), %xmm0 | |
564 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L | |
565 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
566 | #else | |
567 | movdqa (%rsi,%rdx), %xmm1 | |
568 | TOLOWER (%xmm0, %xmm1) | |
569 | pcmpistri $0x1a, %xmm1, %xmm0 | |
570 | #endif | |
571 | jbe LABEL(exit_use) | |
572 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
573 | sub $16, %r11 | |
574 | jbe LABEL(strcmp_exitz) | |
575 | #endif | |
576 | add $16, %rdx | |
577 | jmp LABEL(loop_ashr_3_use) | |
578 | ||
579 | .p2align 4 | |
580 | LABEL(nibble_ashr_3_use): | |
581 | sub $0x1000, %r10 | |
582 | movdqa -16(%rdi, %rdx), %xmm0 | |
583 | psrldq $3, %xmm0 | |
584 | pcmpistri $0x3a,%xmm0, %xmm0 | |
585 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
586 | cmp %r11, %rcx | |
587 | jae LABEL(nibble_ashr_exit_use) | |
588 | #endif | |
589 | cmp $12, %ecx | |
590 | ja LABEL(nibble_ashr_3_restart_use) | |
591 | ||
592 | jmp LABEL(nibble_ashr_exit_use) | |
593 | ||
594 | /* | |
595 | * The following cases will be handled by ashr_4 | |
596 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case | |
597 | * n(12~15) n -12 3(15 +(n-12) - n) ashr_4 | |
598 | */ | |
599 | .p2align 4 | |
600 | LABEL(ashr_4): | |
601 | pxor %xmm0, %xmm0 | |
602 | movdqa (%rdi), %xmm2 | |
603 | movdqa (%rsi), %xmm1 | |
604 | pcmpeqb %xmm1, %xmm0 | |
605 | pslldq $12, %xmm2 | |
606 | TOLOWER (%xmm1, %xmm2) | |
607 | pcmpeqb %xmm1, %xmm2 | |
608 | psubb %xmm0, %xmm2 | |
609 | pmovmskb %xmm2, %r9d | |
610 | shr %cl, %edx | |
611 | shr %cl, %r9d | |
612 | sub %r9d, %edx | |
613 | jnz LABEL(less32bytes) | |
614 | movdqa (%rdi), %xmm3 | |
615 | ||
616 | UPDATE_STRNCMP_COUNTER | |
617 | ||
618 | pxor %xmm0, %xmm0 | |
619 | mov $16, %rcx /* index for loads */ | |
620 | mov $4, %r9d /* byte position left over from less32bytes case */ | |
621 | /* | |
622 | * Setup %r10 value allows us to detect crossing a page boundary. | |
623 | * When %r10 goes positive we have crossed a page boundary and | |
624 | * need to do a nibble. | |
625 | */ | |
626 | lea 4(%rdi), %r10 | |
627 | and $0xfff, %r10 /* offset into 4K page */ | |
628 | sub $0x1000, %r10 /* subtract 4K pagesize */ | |
629 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ | |
630 | ||
631 | .p2align 4 | |
632 | LABEL(loop_ashr_4_use): | |
633 | add $16, %r10 | |
634 | jg LABEL(nibble_ashr_4_use) | |
635 | ||
636 | LABEL(nibble_ashr_4_restart_use): | |
637 | movdqa (%rdi, %rdx), %xmm0 | |
638 | palignr $4, -16(%rdi, %rdx), %xmm0 | |
639 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L | |
640 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
641 | #else | |
642 | movdqa (%rsi,%rdx), %xmm1 | |
643 | TOLOWER (%xmm0, %xmm1) | |
644 | pcmpistri $0x1a, %xmm1, %xmm0 | |
645 | #endif | |
646 | jbe LABEL(exit_use) | |
647 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
648 | sub $16, %r11 | |
649 | jbe LABEL(strcmp_exitz) | |
650 | #endif | |
651 | ||
652 | add $16, %rdx | |
653 | add $16, %r10 | |
654 | jg LABEL(nibble_ashr_4_use) | |
655 | ||
656 | movdqa (%rdi, %rdx), %xmm0 | |
657 | palignr $4, -16(%rdi, %rdx), %xmm0 | |
658 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L | |
659 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
660 | #else | |
661 | movdqa (%rsi,%rdx), %xmm1 | |
662 | TOLOWER (%xmm0, %xmm1) | |
663 | pcmpistri $0x1a, %xmm1, %xmm0 | |
664 | #endif | |
665 | jbe LABEL(exit_use) | |
666 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
667 | sub $16, %r11 | |
668 | jbe LABEL(strcmp_exitz) | |
669 | #endif | |
670 | add $16, %rdx | |
671 | jmp LABEL(loop_ashr_4_use) | |
672 | ||
673 | .p2align 4 | |
674 | LABEL(nibble_ashr_4_use): | |
675 | sub $0x1000, %r10 | |
676 | movdqa -16(%rdi, %rdx), %xmm0 | |
677 | psrldq $4, %xmm0 | |
678 | pcmpistri $0x3a,%xmm0, %xmm0 | |
679 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
680 | cmp %r11, %rcx | |
681 | jae LABEL(nibble_ashr_exit_use) | |
682 | #endif | |
683 | cmp $11, %ecx | |
684 | ja LABEL(nibble_ashr_4_restart_use) | |
685 | ||
686 | jmp LABEL(nibble_ashr_exit_use) | |
687 | ||
688 | /* | |
689 | * The following cases will be handled by ashr_5 | |
690 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case | |
691 | * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5 | |
692 | */ | |
693 | .p2align 4 | |
694 | LABEL(ashr_5): | |
695 | pxor %xmm0, %xmm0 | |
696 | movdqa (%rdi), %xmm2 | |
697 | movdqa (%rsi), %xmm1 | |
698 | pcmpeqb %xmm1, %xmm0 | |
699 | pslldq $11, %xmm2 | |
700 | TOLOWER (%xmm1, %xmm2) | |
701 | pcmpeqb %xmm1, %xmm2 | |
702 | psubb %xmm0, %xmm2 | |
703 | pmovmskb %xmm2, %r9d | |
704 | shr %cl, %edx | |
705 | shr %cl, %r9d | |
706 | sub %r9d, %edx | |
707 | jnz LABEL(less32bytes) | |
708 | movdqa (%rdi), %xmm3 | |
709 | ||
710 | UPDATE_STRNCMP_COUNTER | |
711 | ||
712 | pxor %xmm0, %xmm0 | |
713 | mov $16, %rcx /* index for loads */ | |
714 | mov $5, %r9d /* byte position left over from less32bytes case */ | |
715 | /* | |
716 | * Setup %r10 value allows us to detect crossing a page boundary. | |
717 | * When %r10 goes positive we have crossed a page boundary and | |
718 | * need to do a nibble. | |
719 | */ | |
720 | lea 5(%rdi), %r10 | |
721 | and $0xfff, %r10 /* offset into 4K page */ | |
722 | sub $0x1000, %r10 /* subtract 4K pagesize */ | |
723 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ | |
724 | ||
725 | .p2align 4 | |
726 | LABEL(loop_ashr_5_use): | |
727 | add $16, %r10 | |
728 | jg LABEL(nibble_ashr_5_use) | |
729 | ||
730 | LABEL(nibble_ashr_5_restart_use): | |
731 | movdqa (%rdi, %rdx), %xmm0 | |
732 | palignr $5, -16(%rdi, %rdx), %xmm0 | |
733 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L | |
734 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
735 | #else | |
736 | movdqa (%rsi,%rdx), %xmm1 | |
737 | TOLOWER (%xmm0, %xmm1) | |
738 | pcmpistri $0x1a, %xmm1, %xmm0 | |
739 | #endif | |
740 | jbe LABEL(exit_use) | |
741 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
742 | sub $16, %r11 | |
743 | jbe LABEL(strcmp_exitz) | |
744 | #endif | |
745 | ||
746 | add $16, %rdx | |
747 | add $16, %r10 | |
748 | jg LABEL(nibble_ashr_5_use) | |
749 | ||
750 | movdqa (%rdi, %rdx), %xmm0 | |
751 | ||
752 | palignr $5, -16(%rdi, %rdx), %xmm0 | |
753 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L | |
754 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
755 | #else | |
756 | movdqa (%rsi,%rdx), %xmm1 | |
757 | TOLOWER (%xmm0, %xmm1) | |
758 | pcmpistri $0x1a, %xmm1, %xmm0 | |
759 | #endif | |
760 | jbe LABEL(exit_use) | |
761 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
762 | sub $16, %r11 | |
763 | jbe LABEL(strcmp_exitz) | |
764 | #endif | |
765 | add $16, %rdx | |
766 | jmp LABEL(loop_ashr_5_use) | |
767 | ||
768 | .p2align 4 | |
769 | LABEL(nibble_ashr_5_use): | |
770 | sub $0x1000, %r10 | |
771 | movdqa -16(%rdi, %rdx), %xmm0 | |
772 | psrldq $5, %xmm0 | |
773 | pcmpistri $0x3a,%xmm0, %xmm0 | |
774 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
775 | cmp %r11, %rcx | |
776 | jae LABEL(nibble_ashr_exit_use) | |
777 | #endif | |
778 | cmp $10, %ecx | |
779 | ja LABEL(nibble_ashr_5_restart_use) | |
780 | ||
781 | jmp LABEL(nibble_ashr_exit_use) | |
782 | ||
783 | /* | |
784 | * The following cases will be handled by ashr_6 | |
785 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case | |
786 | * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6 | |
787 | */ | |
788 | .p2align 4 | |
789 | LABEL(ashr_6): | |
790 | pxor %xmm0, %xmm0 | |
791 | movdqa (%rdi), %xmm2 | |
792 | movdqa (%rsi), %xmm1 | |
793 | pcmpeqb %xmm1, %xmm0 | |
794 | pslldq $10, %xmm2 | |
795 | TOLOWER (%xmm1, %xmm2) | |
796 | pcmpeqb %xmm1, %xmm2 | |
797 | psubb %xmm0, %xmm2 | |
798 | pmovmskb %xmm2, %r9d | |
799 | shr %cl, %edx | |
800 | shr %cl, %r9d | |
801 | sub %r9d, %edx | |
802 | jnz LABEL(less32bytes) | |
803 | movdqa (%rdi), %xmm3 | |
804 | ||
805 | UPDATE_STRNCMP_COUNTER | |
806 | ||
807 | pxor %xmm0, %xmm0 | |
808 | mov $16, %rcx /* index for loads */ | |
809 | mov $6, %r9d /* byte position left over from less32bytes case */ | |
810 | /* | |
811 | * Setup %r10 value allows us to detect crossing a page boundary. | |
812 | * When %r10 goes positive we have crossed a page boundary and | |
813 | * need to do a nibble. | |
814 | */ | |
815 | lea 6(%rdi), %r10 | |
816 | and $0xfff, %r10 /* offset into 4K page */ | |
817 | sub $0x1000, %r10 /* subtract 4K pagesize */ | |
818 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ | |
819 | ||
820 | .p2align 4 | |
821 | LABEL(loop_ashr_6_use): | |
822 | add $16, %r10 | |
823 | jg LABEL(nibble_ashr_6_use) | |
824 | ||
825 | LABEL(nibble_ashr_6_restart_use): | |
826 | movdqa (%rdi, %rdx), %xmm0 | |
827 | palignr $6, -16(%rdi, %rdx), %xmm0 | |
828 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L | |
829 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
830 | #else | |
831 | movdqa (%rsi,%rdx), %xmm1 | |
832 | TOLOWER (%xmm0, %xmm1) | |
833 | pcmpistri $0x1a, %xmm1, %xmm0 | |
834 | #endif | |
835 | jbe LABEL(exit_use) | |
836 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
837 | sub $16, %r11 | |
838 | jbe LABEL(strcmp_exitz) | |
839 | #endif | |
840 | ||
841 | add $16, %rdx | |
842 | add $16, %r10 | |
843 | jg LABEL(nibble_ashr_6_use) | |
844 | ||
845 | movdqa (%rdi, %rdx), %xmm0 | |
846 | palignr $6, -16(%rdi, %rdx), %xmm0 | |
847 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L | |
848 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
849 | #else | |
850 | movdqa (%rsi,%rdx), %xmm1 | |
851 | TOLOWER (%xmm0, %xmm1) | |
852 | pcmpistri $0x1a, %xmm1, %xmm0 | |
853 | #endif | |
854 | jbe LABEL(exit_use) | |
855 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
856 | sub $16, %r11 | |
857 | jbe LABEL(strcmp_exitz) | |
858 | #endif | |
859 | add $16, %rdx | |
860 | jmp LABEL(loop_ashr_6_use) | |
861 | ||
862 | .p2align 4 | |
863 | LABEL(nibble_ashr_6_use): | |
864 | sub $0x1000, %r10 | |
865 | movdqa -16(%rdi, %rdx), %xmm0 | |
866 | psrldq $6, %xmm0 | |
867 | pcmpistri $0x3a,%xmm0, %xmm0 | |
868 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
869 | cmp %r11, %rcx | |
870 | jae LABEL(nibble_ashr_exit_use) | |
871 | #endif | |
872 | cmp $9, %ecx | |
873 | ja LABEL(nibble_ashr_6_restart_use) | |
874 | ||
875 | jmp LABEL(nibble_ashr_exit_use) | |
876 | ||
877 | /* | |
878 | * The following cases will be handled by ashr_7 | |
879 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case | |
880 | * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7 | |
881 | */ | |
882 | .p2align 4 | |
883 | LABEL(ashr_7): | |
884 | pxor %xmm0, %xmm0 | |
885 | movdqa (%rdi), %xmm2 | |
886 | movdqa (%rsi), %xmm1 | |
887 | pcmpeqb %xmm1, %xmm0 | |
888 | pslldq $9, %xmm2 | |
889 | TOLOWER (%xmm1, %xmm2) | |
890 | pcmpeqb %xmm1, %xmm2 | |
891 | psubb %xmm0, %xmm2 | |
892 | pmovmskb %xmm2, %r9d | |
893 | shr %cl, %edx | |
894 | shr %cl, %r9d | |
895 | sub %r9d, %edx | |
896 | jnz LABEL(less32bytes) | |
897 | movdqa (%rdi), %xmm3 | |
898 | ||
899 | UPDATE_STRNCMP_COUNTER | |
900 | ||
901 | pxor %xmm0, %xmm0 | |
902 | mov $16, %rcx /* index for loads */ | |
903 | mov $7, %r9d /* byte position left over from less32bytes case */ | |
904 | /* | |
905 | * Setup %r10 value allows us to detect crossing a page boundary. | |
906 | * When %r10 goes positive we have crossed a page boundary and | |
907 | * need to do a nibble. | |
908 | */ | |
909 | lea 7(%rdi), %r10 | |
910 | and $0xfff, %r10 /* offset into 4K page */ | |
911 | sub $0x1000, %r10 /* subtract 4K pagesize */ | |
912 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ | |
913 | ||
914 | .p2align 4 | |
915 | LABEL(loop_ashr_7_use): | |
916 | add $16, %r10 | |
917 | jg LABEL(nibble_ashr_7_use) | |
918 | ||
919 | LABEL(nibble_ashr_7_restart_use): | |
920 | movdqa (%rdi, %rdx), %xmm0 | |
921 | palignr $7, -16(%rdi, %rdx), %xmm0 | |
922 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L | |
923 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
924 | #else | |
925 | movdqa (%rsi,%rdx), %xmm1 | |
926 | TOLOWER (%xmm0, %xmm1) | |
927 | pcmpistri $0x1a, %xmm1, %xmm0 | |
928 | #endif | |
929 | jbe LABEL(exit_use) | |
930 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
931 | sub $16, %r11 | |
932 | jbe LABEL(strcmp_exitz) | |
933 | #endif | |
934 | ||
935 | add $16, %rdx | |
936 | add $16, %r10 | |
937 | jg LABEL(nibble_ashr_7_use) | |
938 | ||
939 | movdqa (%rdi, %rdx), %xmm0 | |
940 | palignr $7, -16(%rdi, %rdx), %xmm0 | |
941 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L | |
942 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
943 | #else | |
944 | movdqa (%rsi,%rdx), %xmm1 | |
945 | TOLOWER (%xmm0, %xmm1) | |
946 | pcmpistri $0x1a, %xmm1, %xmm0 | |
947 | #endif | |
948 | jbe LABEL(exit_use) | |
949 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
950 | sub $16, %r11 | |
951 | jbe LABEL(strcmp_exitz) | |
952 | #endif | |
953 | add $16, %rdx | |
954 | jmp LABEL(loop_ashr_7_use) | |
955 | ||
956 | .p2align 4 | |
957 | LABEL(nibble_ashr_7_use): | |
958 | sub $0x1000, %r10 | |
959 | movdqa -16(%rdi, %rdx), %xmm0 | |
960 | psrldq $7, %xmm0 | |
961 | pcmpistri $0x3a,%xmm0, %xmm0 | |
962 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
963 | cmp %r11, %rcx | |
964 | jae LABEL(nibble_ashr_exit_use) | |
965 | #endif | |
966 | cmp $8, %ecx | |
967 | ja LABEL(nibble_ashr_7_restart_use) | |
968 | ||
969 | jmp LABEL(nibble_ashr_exit_use) | |
970 | ||
971 | /* | |
972 | * The following cases will be handled by ashr_8 | |
973 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case | |
974 | * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8 | |
975 | */ | |
976 | .p2align 4 | |
977 | LABEL(ashr_8): | |
978 | pxor %xmm0, %xmm0 | |
979 | movdqa (%rdi), %xmm2 | |
980 | movdqa (%rsi), %xmm1 | |
981 | pcmpeqb %xmm1, %xmm0 | |
982 | pslldq $8, %xmm2 | |
983 | TOLOWER (%xmm1, %xmm2) | |
984 | pcmpeqb %xmm1, %xmm2 | |
985 | psubb %xmm0, %xmm2 | |
986 | pmovmskb %xmm2, %r9d | |
987 | shr %cl, %edx | |
988 | shr %cl, %r9d | |
989 | sub %r9d, %edx | |
990 | jnz LABEL(less32bytes) | |
991 | movdqa (%rdi), %xmm3 | |
992 | ||
993 | UPDATE_STRNCMP_COUNTER | |
994 | ||
995 | pxor %xmm0, %xmm0 | |
996 | mov $16, %rcx /* index for loads */ | |
997 | mov $8, %r9d /* byte position left over from less32bytes case */ | |
998 | /* | |
999 | * Setup %r10 value allows us to detect crossing a page boundary. | |
1000 | * When %r10 goes positive we have crossed a page boundary and | |
1001 | * need to do a nibble. | |
1002 | */ | |
1003 | lea 8(%rdi), %r10 | |
1004 | and $0xfff, %r10 /* offset into 4K page */ | |
1005 | sub $0x1000, %r10 /* subtract 4K pagesize */ | |
1006 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ | |
1007 | ||
1008 | .p2align 4 | |
1009 | LABEL(loop_ashr_8_use): | |
1010 | add $16, %r10 | |
1011 | jg LABEL(nibble_ashr_8_use) | |
1012 | ||
1013 | LABEL(nibble_ashr_8_restart_use): | |
1014 | movdqa (%rdi, %rdx), %xmm0 | |
1015 | palignr $8, -16(%rdi, %rdx), %xmm0 | |
1016 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L | |
1017 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 | |
1018 | #else | |
1019 | movdqa (%rsi,%rdx), %xmm1 | |
1020 | TOLOWER (%xmm0, %xmm1) | |
1021 | pcmpistri $0x1a, %xmm1, %xmm0 | |
1022 | #endif | |
1023 | jbe LABEL(exit_use) | |
1024 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1025 | sub $16, %r11 | |
1026 | jbe LABEL(strcmp_exitz) | |
1027 | #endif | |
1028 | ||
1029 | add $16, %rdx | |
1030 | add $16, %r10 | |
1031 | jg LABEL(nibble_ashr_8_use) | |
1032 | ||
1033 | movdqa (%rdi, %rdx), %xmm0 | |
1034 | palignr $8, -16(%rdi, %rdx), %xmm0 | |
1035 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L | |
1036 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 | |
1037 | #else | |
1038 | movdqa (%rsi,%rdx), %xmm1 | |
1039 | TOLOWER (%xmm0, %xmm1) | |
1040 | pcmpistri $0x1a, %xmm1, %xmm0 | |
1041 | #endif | |
1042 | jbe LABEL(exit_use) | |
1043 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1044 | sub $16, %r11 | |
1045 | jbe LABEL(strcmp_exitz) | |
1046 | #endif | |
1047 | add $16, %rdx | |
1048 | jmp LABEL(loop_ashr_8_use) | |
1049 | ||
1050 | .p2align 4 | |
1051 | LABEL(nibble_ashr_8_use): | |
1052 | sub $0x1000, %r10 | |
1053 | movdqa -16(%rdi, %rdx), %xmm0 | |
1054 | psrldq $8, %xmm0 | |
1055 | pcmpistri $0x3a,%xmm0, %xmm0 | |
1056 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1057 | cmp %r11, %rcx | |
1058 | jae LABEL(nibble_ashr_exit_use) | |
1059 | #endif | |
1060 | cmp $7, %ecx | |
1061 | ja LABEL(nibble_ashr_8_restart_use) | |
1062 | ||
1063 | jmp LABEL(nibble_ashr_exit_use) | |
1064 | ||
1065 | /* | |
1066 | * The following cases will be handled by ashr_9 | |
1067 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case | |
1068 | * n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9 | |
1069 | */ | |
1070 | .p2align 4 | |
1071 | LABEL(ashr_9): | |
1072 | pxor %xmm0, %xmm0 | |
1073 | movdqa (%rdi), %xmm2 | |
1074 | movdqa (%rsi), %xmm1 | |
1075 | pcmpeqb %xmm1, %xmm0 | |
1076 | pslldq $7, %xmm2 | |
1077 | TOLOWER (%xmm1, %xmm2) | |
1078 | pcmpeqb %xmm1, %xmm2 | |
1079 | psubb %xmm0, %xmm2 | |
1080 | pmovmskb %xmm2, %r9d | |
1081 | shr %cl, %edx | |
1082 | shr %cl, %r9d | |
1083 | sub %r9d, %edx | |
1084 | jnz LABEL(less32bytes) | |
1085 | movdqa (%rdi), %xmm3 | |
1086 | ||
1087 | UPDATE_STRNCMP_COUNTER | |
1088 | ||
1089 | pxor %xmm0, %xmm0 | |
1090 | mov $16, %rcx /* index for loads */ | |
1091 | mov $9, %r9d /* byte position left over from less32bytes case */ | |
1092 | /* | |
1093 | * Setup %r10 value allows us to detect crossing a page boundary. | |
1094 | * When %r10 goes positive we have crossed a page boundary and | |
1095 | * need to do a nibble. | |
1096 | */ | |
1097 | lea 9(%rdi), %r10 | |
1098 | and $0xfff, %r10 /* offset into 4K page */ | |
1099 | sub $0x1000, %r10 /* subtract 4K pagesize */ | |
1100 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ | |
1101 | ||
1102 | .p2align 4 | |
1103 | LABEL(loop_ashr_9_use): | |
1104 | add $16, %r10 | |
1105 | jg LABEL(nibble_ashr_9_use) | |
1106 | ||
1107 | LABEL(nibble_ashr_9_restart_use): | |
1108 | movdqa (%rdi, %rdx), %xmm0 | |
1109 | ||
1110 | palignr $9, -16(%rdi, %rdx), %xmm0 | |
1111 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L | |
1112 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 | |
1113 | #else | |
1114 | movdqa (%rsi,%rdx), %xmm1 | |
1115 | TOLOWER (%xmm0, %xmm1) | |
1116 | pcmpistri $0x1a, %xmm1, %xmm0 | |
1117 | #endif | |
1118 | jbe LABEL(exit_use) | |
1119 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1120 | sub $16, %r11 | |
1121 | jbe LABEL(strcmp_exitz) | |
1122 | #endif | |
1123 | ||
1124 | add $16, %rdx | |
1125 | add $16, %r10 | |
1126 | jg LABEL(nibble_ashr_9_use) | |
1127 | ||
1128 | movdqa (%rdi, %rdx), %xmm0 | |
1129 | palignr $9, -16(%rdi, %rdx), %xmm0 | |
1130 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L | |
1131 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 | |
1132 | #else | |
1133 | movdqa (%rsi,%rdx), %xmm1 | |
1134 | TOLOWER (%xmm0, %xmm1) | |
1135 | pcmpistri $0x1a, %xmm1, %xmm0 | |
1136 | #endif | |
1137 | jbe LABEL(exit_use) | |
1138 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1139 | sub $16, %r11 | |
1140 | jbe LABEL(strcmp_exitz) | |
1141 | #endif | |
1142 | add $16, %rdx | |
1143 | jmp LABEL(loop_ashr_9_use) | |
1144 | ||
1145 | .p2align 4 | |
1146 | LABEL(nibble_ashr_9_use): | |
1147 | sub $0x1000, %r10 | |
1148 | movdqa -16(%rdi, %rdx), %xmm0 | |
1149 | psrldq $9, %xmm0 | |
1150 | pcmpistri $0x3a,%xmm0, %xmm0 | |
1151 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1152 | cmp %r11, %rcx | |
1153 | jae LABEL(nibble_ashr_exit_use) | |
1154 | #endif | |
1155 | cmp $6, %ecx | |
1156 | ja LABEL(nibble_ashr_9_restart_use) | |
1157 | ||
1158 | jmp LABEL(nibble_ashr_exit_use) | |
1159 | ||
1160 | /* | |
1161 | * The following cases will be handled by ashr_10 | |
1162 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case | |
1163 | * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10 | |
1164 | */ | |
1165 | .p2align 4 | |
1166 | LABEL(ashr_10): | |
1167 | pxor %xmm0, %xmm0 | |
1168 | movdqa (%rdi), %xmm2 | |
1169 | movdqa (%rsi), %xmm1 | |
1170 | pcmpeqb %xmm1, %xmm0 | |
1171 | pslldq $6, %xmm2 | |
1172 | TOLOWER (%xmm1, %xmm2) | |
1173 | pcmpeqb %xmm1, %xmm2 | |
1174 | psubb %xmm0, %xmm2 | |
1175 | pmovmskb %xmm2, %r9d | |
1176 | shr %cl, %edx | |
1177 | shr %cl, %r9d | |
1178 | sub %r9d, %edx | |
1179 | jnz LABEL(less32bytes) | |
1180 | movdqa (%rdi), %xmm3 | |
1181 | ||
1182 | UPDATE_STRNCMP_COUNTER | |
1183 | ||
1184 | pxor %xmm0, %xmm0 | |
1185 | mov $16, %rcx /* index for loads */ | |
1186 | mov $10, %r9d /* byte position left over from less32bytes case */ | |
1187 | /* | |
1188 | * Setup %r10 value allows us to detect crossing a page boundary. | |
1189 | * When %r10 goes positive we have crossed a page boundary and | |
1190 | * need to do a nibble. | |
1191 | */ | |
1192 | lea 10(%rdi), %r10 | |
1193 | and $0xfff, %r10 /* offset into 4K page */ | |
1194 | sub $0x1000, %r10 /* subtract 4K pagesize */ | |
1195 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ | |
1196 | ||
1197 | .p2align 4 | |
1198 | LABEL(loop_ashr_10_use): | |
1199 | add $16, %r10 | |
1200 | jg LABEL(nibble_ashr_10_use) | |
1201 | ||
1202 | LABEL(nibble_ashr_10_restart_use): | |
1203 | movdqa (%rdi, %rdx), %xmm0 | |
1204 | palignr $10, -16(%rdi, %rdx), %xmm0 | |
1205 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L | |
1206 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 | |
1207 | #else | |
1208 | movdqa (%rsi,%rdx), %xmm1 | |
1209 | TOLOWER (%xmm0, %xmm1) | |
1210 | pcmpistri $0x1a, %xmm1, %xmm0 | |
1211 | #endif | |
1212 | jbe LABEL(exit_use) | |
1213 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1214 | sub $16, %r11 | |
1215 | jbe LABEL(strcmp_exitz) | |
1216 | #endif | |
1217 | ||
1218 | add $16, %rdx | |
1219 | add $16, %r10 | |
1220 | jg LABEL(nibble_ashr_10_use) | |
1221 | ||
1222 | movdqa (%rdi, %rdx), %xmm0 | |
1223 | palignr $10, -16(%rdi, %rdx), %xmm0 | |
1224 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L | |
1225 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 | |
1226 | #else | |
1227 | movdqa (%rsi,%rdx), %xmm1 | |
1228 | TOLOWER (%xmm0, %xmm1) | |
1229 | pcmpistri $0x1a, %xmm1, %xmm0 | |
1230 | #endif | |
1231 | jbe LABEL(exit_use) | |
1232 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1233 | sub $16, %r11 | |
1234 | jbe LABEL(strcmp_exitz) | |
1235 | #endif | |
1236 | add $16, %rdx | |
1237 | jmp LABEL(loop_ashr_10_use) | |
1238 | ||
1239 | .p2align 4 | |
1240 | LABEL(nibble_ashr_10_use): | |
1241 | sub $0x1000, %r10 | |
1242 | movdqa -16(%rdi, %rdx), %xmm0 | |
1243 | psrldq $10, %xmm0 | |
1244 | pcmpistri $0x3a,%xmm0, %xmm0 | |
1245 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1246 | cmp %r11, %rcx | |
1247 | jae LABEL(nibble_ashr_exit_use) | |
1248 | #endif | |
1249 | cmp $5, %ecx | |
1250 | ja LABEL(nibble_ashr_10_restart_use) | |
1251 | ||
1252 | jmp LABEL(nibble_ashr_exit_use) | |
1253 | ||
1254 | /* | |
1255 | * The following cases will be handled by ashr_11 | |
1256 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case | |
1257 | * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11 | |
1258 | */ | |
1259 | .p2align 4 | |
1260 | LABEL(ashr_11): | |
1261 | pxor %xmm0, %xmm0 | |
1262 | movdqa (%rdi), %xmm2 | |
1263 | movdqa (%rsi), %xmm1 | |
1264 | pcmpeqb %xmm1, %xmm0 | |
1265 | pslldq $5, %xmm2 | |
1266 | TOLOWER (%xmm1, %xmm2) | |
1267 | pcmpeqb %xmm1, %xmm2 | |
1268 | psubb %xmm0, %xmm2 | |
1269 | pmovmskb %xmm2, %r9d | |
1270 | shr %cl, %edx | |
1271 | shr %cl, %r9d | |
1272 | sub %r9d, %edx | |
1273 | jnz LABEL(less32bytes) | |
1274 | movdqa (%rdi), %xmm3 | |
1275 | ||
1276 | UPDATE_STRNCMP_COUNTER | |
1277 | ||
1278 | pxor %xmm0, %xmm0 | |
1279 | mov $16, %rcx /* index for loads */ | |
1280 | mov $11, %r9d /* byte position left over from less32bytes case */ | |
1281 | /* | |
1282 | * Setup %r10 value allows us to detect crossing a page boundary. | |
1283 | * When %r10 goes positive we have crossed a page boundary and | |
1284 | * need to do a nibble. | |
1285 | */ | |
1286 | lea 11(%rdi), %r10 | |
1287 | and $0xfff, %r10 /* offset into 4K page */ | |
1288 | sub $0x1000, %r10 /* subtract 4K pagesize */ | |
1289 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ | |
1290 | ||
1291 | .p2align 4 | |
1292 | LABEL(loop_ashr_11_use): | |
1293 | add $16, %r10 | |
1294 | jg LABEL(nibble_ashr_11_use) | |
1295 | ||
1296 | LABEL(nibble_ashr_11_restart_use): | |
1297 | movdqa (%rdi, %rdx), %xmm0 | |
1298 | palignr $11, -16(%rdi, %rdx), %xmm0 | |
1299 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L | |
1300 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 | |
1301 | #else | |
1302 | movdqa (%rsi,%rdx), %xmm1 | |
1303 | TOLOWER (%xmm0, %xmm1) | |
1304 | pcmpistri $0x1a, %xmm1, %xmm0 | |
1305 | #endif | |
1306 | jbe LABEL(exit_use) | |
1307 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1308 | sub $16, %r11 | |
1309 | jbe LABEL(strcmp_exitz) | |
1310 | #endif | |
1311 | ||
1312 | add $16, %rdx | |
1313 | add $16, %r10 | |
1314 | jg LABEL(nibble_ashr_11_use) | |
1315 | ||
1316 | movdqa (%rdi, %rdx), %xmm0 | |
1317 | palignr $11, -16(%rdi, %rdx), %xmm0 | |
1318 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L | |
1319 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 | |
1320 | #else | |
1321 | movdqa (%rsi,%rdx), %xmm1 | |
1322 | TOLOWER (%xmm0, %xmm1) | |
1323 | pcmpistri $0x1a, %xmm1, %xmm0 | |
1324 | #endif | |
1325 | jbe LABEL(exit_use) | |
1326 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1327 | sub $16, %r11 | |
1328 | jbe LABEL(strcmp_exitz) | |
1329 | #endif | |
1330 | add $16, %rdx | |
1331 | jmp LABEL(loop_ashr_11_use) | |
1332 | ||
1333 | .p2align 4 | |
1334 | LABEL(nibble_ashr_11_use): | |
1335 | sub $0x1000, %r10 | |
1336 | movdqa -16(%rdi, %rdx), %xmm0 | |
1337 | psrldq $11, %xmm0 | |
1338 | pcmpistri $0x3a,%xmm0, %xmm0 | |
1339 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1340 | cmp %r11, %rcx | |
1341 | jae LABEL(nibble_ashr_exit_use) | |
1342 | #endif | |
1343 | cmp $4, %ecx | |
1344 | ja LABEL(nibble_ashr_11_restart_use) | |
1345 | ||
1346 | jmp LABEL(nibble_ashr_exit_use) | |
1347 | ||
1348 | /* | |
1349 | * The following cases will be handled by ashr_12 | |
1350 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case | |
1351 | * n(4~15) n - 4 11(15 +(n - 4) - n) ashr_12 | |
1352 | */ | |
1353 | .p2align 4 | |
1354 | LABEL(ashr_12): | |
1355 | pxor %xmm0, %xmm0 | |
1356 | movdqa (%rdi), %xmm2 | |
1357 | movdqa (%rsi), %xmm1 | |
1358 | pcmpeqb %xmm1, %xmm0 | |
1359 | pslldq $4, %xmm2 | |
1360 | TOLOWER (%xmm1, %xmm2) | |
1361 | pcmpeqb %xmm1, %xmm2 | |
1362 | psubb %xmm0, %xmm2 | |
1363 | pmovmskb %xmm2, %r9d | |
1364 | shr %cl, %edx | |
1365 | shr %cl, %r9d | |
1366 | sub %r9d, %edx | |
1367 | jnz LABEL(less32bytes) | |
1368 | movdqa (%rdi), %xmm3 | |
1369 | ||
1370 | UPDATE_STRNCMP_COUNTER | |
1371 | ||
1372 | pxor %xmm0, %xmm0 | |
1373 | mov $16, %rcx /* index for loads */ | |
1374 | mov $12, %r9d /* byte position left over from less32bytes case */ | |
1375 | /* | |
1376 | * Setup %r10 value allows us to detect crossing a page boundary. | |
1377 | * When %r10 goes positive we have crossed a page boundary and | |
1378 | * need to do a nibble. | |
1379 | */ | |
1380 | lea 12(%rdi), %r10 | |
1381 | and $0xfff, %r10 /* offset into 4K page */ | |
1382 | sub $0x1000, %r10 /* subtract 4K pagesize */ | |
1383 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ | |
1384 | ||
1385 | .p2align 4 | |
1386 | LABEL(loop_ashr_12_use): | |
1387 | add $16, %r10 | |
1388 | jg LABEL(nibble_ashr_12_use) | |
1389 | ||
1390 | LABEL(nibble_ashr_12_restart_use): | |
1391 | movdqa (%rdi, %rdx), %xmm0 | |
1392 | palignr $12, -16(%rdi, %rdx), %xmm0 | |
1393 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L | |
1394 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 | |
1395 | #else | |
1396 | movdqa (%rsi,%rdx), %xmm1 | |
1397 | TOLOWER (%xmm0, %xmm1) | |
1398 | pcmpistri $0x1a, %xmm1, %xmm0 | |
1399 | #endif | |
1400 | jbe LABEL(exit_use) | |
1401 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1402 | sub $16, %r11 | |
1403 | jbe LABEL(strcmp_exitz) | |
1404 | #endif | |
1405 | ||
1406 | add $16, %rdx | |
1407 | add $16, %r10 | |
1408 | jg LABEL(nibble_ashr_12_use) | |
1409 | ||
1410 | movdqa (%rdi, %rdx), %xmm0 | |
1411 | palignr $12, -16(%rdi, %rdx), %xmm0 | |
1412 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L | |
1413 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 | |
1414 | #else | |
1415 | movdqa (%rsi,%rdx), %xmm1 | |
1416 | TOLOWER (%xmm0, %xmm1) | |
1417 | pcmpistri $0x1a, %xmm1, %xmm0 | |
1418 | #endif | |
1419 | jbe LABEL(exit_use) | |
1420 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1421 | sub $16, %r11 | |
1422 | jbe LABEL(strcmp_exitz) | |
1423 | #endif | |
1424 | add $16, %rdx | |
1425 | jmp LABEL(loop_ashr_12_use) | |
1426 | ||
1427 | .p2align 4 | |
1428 | LABEL(nibble_ashr_12_use): | |
1429 | sub $0x1000, %r10 | |
1430 | movdqa -16(%rdi, %rdx), %xmm0 | |
1431 | psrldq $12, %xmm0 | |
1432 | pcmpistri $0x3a,%xmm0, %xmm0 | |
1433 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1434 | cmp %r11, %rcx | |
1435 | jae LABEL(nibble_ashr_exit_use) | |
1436 | #endif | |
1437 | cmp $3, %ecx | |
1438 | ja LABEL(nibble_ashr_12_restart_use) | |
1439 | ||
1440 | jmp LABEL(nibble_ashr_exit_use) | |
1441 | ||
1442 | /* | |
1443 | * The following cases will be handled by ashr_13 | |
1444 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case | |
1445 | * n(3~15) n - 3 12(15 +(n - 3) - n) ashr_13 | |
1446 | */ | |
1447 | .p2align 4 | |
1448 | LABEL(ashr_13): | |
1449 | pxor %xmm0, %xmm0 | |
1450 | movdqa (%rdi), %xmm2 | |
1451 | movdqa (%rsi), %xmm1 | |
1452 | pcmpeqb %xmm1, %xmm0 | |
1453 | pslldq $3, %xmm2 | |
1454 | TOLOWER (%xmm1, %xmm2) | |
1455 | pcmpeqb %xmm1, %xmm2 | |
1456 | psubb %xmm0, %xmm2 | |
1457 | pmovmskb %xmm2, %r9d | |
1458 | shr %cl, %edx | |
1459 | shr %cl, %r9d | |
1460 | sub %r9d, %edx | |
1461 | jnz LABEL(less32bytes) | |
1462 | movdqa (%rdi), %xmm3 | |
1463 | ||
1464 | UPDATE_STRNCMP_COUNTER | |
1465 | ||
1466 | pxor %xmm0, %xmm0 | |
1467 | mov $16, %rcx /* index for loads */ | |
1468 | mov $13, %r9d /* byte position left over from less32bytes case */ | |
1469 | /* | |
1470 | * Setup %r10 value allows us to detect crossing a page boundary. | |
1471 | * When %r10 goes positive we have crossed a page boundary and | |
1472 | * need to do a nibble. | |
1473 | */ | |
1474 | lea 13(%rdi), %r10 | |
1475 | and $0xfff, %r10 /* offset into 4K page */ | |
1476 | sub $0x1000, %r10 /* subtract 4K pagesize */ | |
1477 | ||
1478 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ | |
1479 | ||
1480 | .p2align 4 | |
1481 | LABEL(loop_ashr_13_use): | |
1482 | add $16, %r10 | |
1483 | jg LABEL(nibble_ashr_13_use) | |
1484 | ||
1485 | LABEL(nibble_ashr_13_restart_use): | |
1486 | movdqa (%rdi, %rdx), %xmm0 | |
1487 | palignr $13, -16(%rdi, %rdx), %xmm0 | |
1488 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L | |
1489 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 | |
1490 | #else | |
1491 | movdqa (%rsi,%rdx), %xmm1 | |
1492 | TOLOWER (%xmm0, %xmm1) | |
1493 | pcmpistri $0x1a, %xmm1, %xmm0 | |
1494 | #endif | |
1495 | jbe LABEL(exit_use) | |
1496 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1497 | sub $16, %r11 | |
1498 | jbe LABEL(strcmp_exitz) | |
1499 | #endif | |
1500 | ||
1501 | add $16, %rdx | |
1502 | add $16, %r10 | |
1503 | jg LABEL(nibble_ashr_13_use) | |
1504 | ||
1505 | movdqa (%rdi, %rdx), %xmm0 | |
1506 | palignr $13, -16(%rdi, %rdx), %xmm0 | |
1507 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L | |
1508 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 | |
1509 | #else | |
1510 | movdqa (%rsi,%rdx), %xmm1 | |
1511 | TOLOWER (%xmm0, %xmm1) | |
1512 | pcmpistri $0x1a, %xmm1, %xmm0 | |
1513 | #endif | |
1514 | jbe LABEL(exit_use) | |
1515 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1516 | sub $16, %r11 | |
1517 | jbe LABEL(strcmp_exitz) | |
1518 | #endif | |
1519 | add $16, %rdx | |
1520 | jmp LABEL(loop_ashr_13_use) | |
1521 | ||
1522 | .p2align 4 | |
1523 | LABEL(nibble_ashr_13_use): | |
1524 | sub $0x1000, %r10 | |
1525 | movdqa -16(%rdi, %rdx), %xmm0 | |
1526 | psrldq $13, %xmm0 | |
1527 | pcmpistri $0x3a,%xmm0, %xmm0 | |
1528 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1529 | cmp %r11, %rcx | |
1530 | jae LABEL(nibble_ashr_exit_use) | |
1531 | #endif | |
1532 | cmp $2, %ecx | |
1533 | ja LABEL(nibble_ashr_13_restart_use) | |
1534 | ||
1535 | jmp LABEL(nibble_ashr_exit_use) | |
1536 | ||
1537 | /* | |
1538 | * The following cases will be handled by ashr_14 | |
1539 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case | |
1540 | * n(2~15) n - 2 13(15 +(n - 2) - n) ashr_14 | |
1541 | */ | |
1542 | .p2align 4 | |
1543 | LABEL(ashr_14): | |
1544 | pxor %xmm0, %xmm0 | |
1545 | movdqa (%rdi), %xmm2 | |
1546 | movdqa (%rsi), %xmm1 | |
1547 | pcmpeqb %xmm1, %xmm0 | |
1548 | pslldq $2, %xmm2 | |
1549 | TOLOWER (%xmm1, %xmm2) | |
1550 | pcmpeqb %xmm1, %xmm2 | |
1551 | psubb %xmm0, %xmm2 | |
1552 | pmovmskb %xmm2, %r9d | |
1553 | shr %cl, %edx | |
1554 | shr %cl, %r9d | |
1555 | sub %r9d, %edx | |
1556 | jnz LABEL(less32bytes) | |
1557 | movdqa (%rdi), %xmm3 | |
1558 | ||
1559 | UPDATE_STRNCMP_COUNTER | |
1560 | ||
1561 | pxor %xmm0, %xmm0 | |
1562 | mov $16, %rcx /* index for loads */ | |
1563 | mov $14, %r9d /* byte position left over from less32bytes case */ | |
1564 | /* | |
1565 | * Setup %r10 value allows us to detect crossing a page boundary. | |
1566 | * When %r10 goes positive we have crossed a page boundary and | |
1567 | * need to do a nibble. | |
1568 | */ | |
1569 | lea 14(%rdi), %r10 | |
1570 | and $0xfff, %r10 /* offset into 4K page */ | |
1571 | sub $0x1000, %r10 /* subtract 4K pagesize */ | |
1572 | ||
1573 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ | |
1574 | ||
1575 | .p2align 4 | |
1576 | LABEL(loop_ashr_14_use): | |
1577 | add $16, %r10 | |
1578 | jg LABEL(nibble_ashr_14_use) | |
1579 | ||
1580 | LABEL(nibble_ashr_14_restart_use): | |
1581 | movdqa (%rdi, %rdx), %xmm0 | |
1582 | palignr $14, -16(%rdi, %rdx), %xmm0 | |
1583 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L | |
1584 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 | |
1585 | #else | |
1586 | movdqa (%rsi,%rdx), %xmm1 | |
1587 | TOLOWER (%xmm0, %xmm1) | |
1588 | pcmpistri $0x1a, %xmm1, %xmm0 | |
1589 | #endif | |
1590 | jbe LABEL(exit_use) | |
1591 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1592 | sub $16, %r11 | |
1593 | jbe LABEL(strcmp_exitz) | |
1594 | #endif | |
1595 | ||
1596 | add $16, %rdx | |
1597 | add $16, %r10 | |
1598 | jg LABEL(nibble_ashr_14_use) | |
1599 | ||
1600 | movdqa (%rdi, %rdx), %xmm0 | |
1601 | palignr $14, -16(%rdi, %rdx), %xmm0 | |
1602 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L | |
1603 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 | |
1604 | #else | |
1605 | movdqa (%rsi,%rdx), %xmm1 | |
1606 | TOLOWER (%xmm0, %xmm1) | |
1607 | pcmpistri $0x1a, %xmm1, %xmm0 | |
1608 | #endif | |
1609 | jbe LABEL(exit_use) | |
1610 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1611 | sub $16, %r11 | |
1612 | jbe LABEL(strcmp_exitz) | |
1613 | #endif | |
1614 | add $16, %rdx | |
1615 | jmp LABEL(loop_ashr_14_use) | |
1616 | ||
1617 | .p2align 4 | |
1618 | LABEL(nibble_ashr_14_use): | |
1619 | sub $0x1000, %r10 | |
1620 | movdqa -16(%rdi, %rdx), %xmm0 | |
1621 | psrldq $14, %xmm0 | |
1622 | pcmpistri $0x3a,%xmm0, %xmm0 | |
1623 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1624 | cmp %r11, %rcx | |
1625 | jae LABEL(nibble_ashr_exit_use) | |
1626 | #endif | |
1627 | cmp $1, %ecx | |
1628 | ja LABEL(nibble_ashr_14_restart_use) | |
1629 | ||
1630 | jmp LABEL(nibble_ashr_exit_use) | |
1631 | ||
1632 | /* | |
1633 | * The following cases will be handled by ashr_15 | |
1634 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case | |
1635 | * n(1~15) n - 1 14(15 +(n - 1) - n) ashr_15 | |
1636 | */ | |
1637 | .p2align 4 | |
1638 | LABEL(ashr_15): | |
1639 | pxor %xmm0, %xmm0 | |
1640 | movdqa (%rdi), %xmm2 | |
1641 | movdqa (%rsi), %xmm1 | |
1642 | pcmpeqb %xmm1, %xmm0 | |
1643 | pslldq $1, %xmm2 | |
1644 | TOLOWER (%xmm1, %xmm2) | |
1645 | pcmpeqb %xmm1, %xmm2 | |
1646 | psubb %xmm0, %xmm2 | |
1647 | pmovmskb %xmm2, %r9d | |
1648 | shr %cl, %edx | |
1649 | shr %cl, %r9d | |
1650 | sub %r9d, %edx | |
1651 | jnz LABEL(less32bytes) | |
1652 | ||
1653 | movdqa (%rdi), %xmm3 | |
1654 | ||
1655 | UPDATE_STRNCMP_COUNTER | |
1656 | ||
1657 | pxor %xmm0, %xmm0 | |
1658 | mov $16, %rcx /* index for loads */ | |
1659 | mov $15, %r9d /* byte position left over from less32bytes case */ | |
1660 | /* | |
1661 | * Setup %r10 value allows us to detect crossing a page boundary. | |
1662 | * When %r10 goes positive we have crossed a page boundary and | |
1663 | * need to do a nibble. | |
1664 | */ | |
1665 | lea 15(%rdi), %r10 | |
1666 | and $0xfff, %r10 /* offset into 4K page */ | |
1667 | ||
1668 | sub $0x1000, %r10 /* subtract 4K pagesize */ | |
1669 | ||
1670 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ | |
1671 | ||
1672 | .p2align 4 | |
1673 | LABEL(loop_ashr_15_use): | |
1674 | add $16, %r10 | |
1675 | jg LABEL(nibble_ashr_15_use) | |
1676 | ||
1677 | LABEL(nibble_ashr_15_restart_use): | |
1678 | movdqa (%rdi, %rdx), %xmm0 | |
1679 | palignr $15, -16(%rdi, %rdx), %xmm0 | |
1680 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L | |
1681 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 | |
1682 | #else | |
1683 | movdqa (%rsi,%rdx), %xmm1 | |
1684 | TOLOWER (%xmm0, %xmm1) | |
1685 | pcmpistri $0x1a, %xmm1, %xmm0 | |
1686 | #endif | |
1687 | jbe LABEL(exit_use) | |
1688 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1689 | sub $16, %r11 | |
1690 | jbe LABEL(strcmp_exitz) | |
1691 | #endif | |
1692 | ||
1693 | add $16, %rdx | |
1694 | add $16, %r10 | |
1695 | jg LABEL(nibble_ashr_15_use) | |
1696 | ||
1697 | movdqa (%rdi, %rdx), %xmm0 | |
1698 | palignr $15, -16(%rdi, %rdx), %xmm0 | |
1699 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L | |
1700 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 | |
1701 | #else | |
1702 | movdqa (%rsi,%rdx), %xmm1 | |
1703 | TOLOWER (%xmm0, %xmm1) | |
1704 | pcmpistri $0x1a, %xmm1, %xmm0 | |
1705 | #endif | |
1706 | jbe LABEL(exit_use) | |
1707 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1708 | sub $16, %r11 | |
1709 | jbe LABEL(strcmp_exitz) | |
1710 | #endif | |
1711 | add $16, %rdx | |
1712 | jmp LABEL(loop_ashr_15_use) | |
1713 | ||
1714 | .p2align 4 | |
1715 | LABEL(nibble_ashr_15_use): | |
1716 | sub $0x1000, %r10 | |
1717 | movdqa -16(%rdi, %rdx), %xmm0 | |
1718 | psrldq $15, %xmm0 | |
1719 | pcmpistri $0x3a,%xmm0, %xmm0 | |
1720 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1721 | cmp %r11, %rcx | |
1722 | jae LABEL(nibble_ashr_exit_use) | |
1723 | #endif | |
1724 | cmp $0, %ecx | |
1725 | ja LABEL(nibble_ashr_15_restart_use) | |
1726 | ||
1727 | LABEL(nibble_ashr_exit_use): | |
1728 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L | |
1729 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
1730 | #else | |
1731 | movdqa (%rsi,%rdx), %xmm1 | |
1732 | TOLOWER (%xmm0, %xmm1) | |
1733 | pcmpistri $0x1a, %xmm1, %xmm0 | |
1734 | #endif | |
1735 | .p2align 4 | |
1736 | LABEL(exit_use): | |
1737 | jnc LABEL(strcmp_exitz) | |
1738 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1739 | sub %rcx, %r11 | |
1740 | jbe LABEL(strcmp_exitz) | |
1741 | #endif | |
1742 | add %rcx, %rdx | |
1743 | lea -16(%rdi, %r9), %rdi | |
1744 | movzbl (%rdi, %rdx), %eax | |
1745 | movzbl (%rsi, %rdx), %edx | |
1746 | test %r8d, %r8d | |
1747 | jz LABEL(ret_use) | |
1748 | xchg %eax, %edx | |
1749 | LABEL(ret_use): | |
1750 | #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L | |
1751 | leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx | |
1752 | movl (%rcx,%rdx,4), %edx | |
1753 | movl (%rcx,%rax,4), %eax | |
1754 | #endif | |
1755 | ||
1756 | sub %edx, %eax | |
1757 | ret | |
1758 | ||
1759 | LABEL(less32bytes): | |
1760 | lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */ | |
1761 | lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */ | |
1762 | test %r8d, %r8d | |
1763 | jz LABEL(ret) | |
1764 | xchg %rsi, %rdi /* recover original order according to flag(%r8d) */ | |
1765 | ||
1766 | .p2align 4 | |
1767 | LABEL(ret): | |
1768 | LABEL(less16bytes): | |
1769 | bsf %rdx, %rdx /* find and store bit index in %rdx */ | |
1770 | ||
1771 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1772 | sub %rdx, %r11 | |
1773 | jbe LABEL(strcmp_exitz) | |
1774 | #endif | |
1775 | movzbl (%rsi, %rdx), %ecx | |
1776 | movzbl (%rdi, %rdx), %eax | |
1777 | ||
1778 | #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L | |
1779 | leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx | |
1780 | movl (%rdx,%rcx,4), %ecx | |
1781 | movl (%rdx,%rax,4), %eax | |
1782 | #endif | |
1783 | ||
1784 | sub %ecx, %eax | |
1785 | ret | |
1786 | ||
1787 | LABEL(strcmp_exitz): | |
1788 | xor %eax, %eax | |
1789 | ret | |
1790 | ||
1791 | .p2align 4 | |
1792 | // XXX Same as code above | |
1793 | LABEL(Byte0): | |
1794 | movzx (%rsi), %ecx | |
1795 | movzx (%rdi), %eax | |
1796 | ||
1797 | #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L | |
1798 | leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx | |
1799 | movl (%rdx,%rcx,4), %ecx | |
1800 | movl (%rdx,%rax,4), %eax | |
1801 | #endif | |
1802 | ||
1803 | sub %ecx, %eax | |
1804 | ret | |
1805 | cfi_endproc | |
1806 | .size STRCMP_SSE42, .-STRCMP_SSE42 | |
1807 | ||
1808 | #undef UCLOW_reg | |
1809 | #undef UCHIGH_reg | |
1810 | #undef LCQWORD_reg | |
1811 | #undef TOLOWER | |
1812 | ||
1813 | /* Put all SSE 4.2 functions together. */ | |
1814 | .section .rodata.SECTION,"a",@progbits | |
1815 | .p2align 3 | |
LABEL(unaligned_table):
1833 | ||
1834 | #undef LABEL | |
1835 | #undef GLABEL | |
1836 | #undef SECTION |