]>
Commit | Line | Data |
---|---|---|
d9a4d2ab | 1 | /* strcmp with SSE4.2 |
04277e02 | 2 | Copyright (C) 2009-2019 Free Software Foundation, Inc. |
d9a4d2ab UD |
3 | Contributed by Intel Corporation. |
4 | This file is part of the GNU C Library. | |
5 | ||
6 | The GNU C Library is free software; you can redistribute it and/or | |
7 | modify it under the terms of the GNU Lesser General Public | |
8 | License as published by the Free Software Foundation; either | |
9 | version 2.1 of the License, or (at your option) any later version. | |
10 | ||
11 | The GNU C Library is distributed in the hope that it will be useful, | |
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
14 | Lesser General Public License for more details. | |
15 | ||
16 | You should have received a copy of the GNU Lesser General Public | |
59ba27a6 | 17 | License along with the GNU C Library; if not, see |
5a82c748 | 18 | <https://www.gnu.org/licenses/>. */ |
d9a4d2ab | 19 | |
11ffcacb L |
20 | #include <sysdep.h> |
21 | ||
22 | #ifndef STRCMP_SSE42 | |
23 | # define STRCMP_SSE42 __strcmp_sse42 | |
24 | #endif | |
25 | ||
26 | #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L | |
27 | # include "locale-defines.h" | |
28 | #endif | |
29 | ||
30 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
31 | /* UPDATE_STRNCMP_COUNTER recomputes the remaining length kept in %r11 as %r11 + %rcx - 16. Since the counter, %r11, is unsigned, we branch to strcmp_exitz | |
32 | if the new counter > the old one (the count wrapped below zero) or is 0; otherwise %r11 takes the new value. Clobbers %r9 and the flags. */ | |
33 | # define UPDATE_STRNCMP_COUNTER \ | |
34 | /* %r9 = number of bytes left to compare = %r11 - (16 - %rcx) */ \ | |
35 | lea -16(%rcx, %r11), %r9; \ | |
36 | cmp %r9, %r11; \ | |
37 | jb LABEL(strcmp_exitz); \ | |
38 | test %r9, %r9; \ | |
39 | je LABEL(strcmp_exitz); \ | |
40 | mov %r9, %r11 | |
41 | #else /* No length counter is kept in these builds, so the macro expands to nothing. */ | |
42 | # define UPDATE_STRNCMP_COUNTER | |
43 | #endif | |
44 | ||
45 | #ifdef USE_AVX /* Pick the per-ISA section suffix and global-symbol suffix. */ | |
46 | # define SECTION avx /* substituted into the ".section .text.SECTION" directive below */ | |
47 | # define GLABEL(l) l##_avx /* GLABEL(l) appends the ISA suffix to a global symbol name */ | |
48 | #else | |
49 | # define SECTION sse4.2 | |
50 | # define GLABEL(l) l##_sse42 | |
51 | #endif | |
52 | ||
53 | #define LABEL(l) .L##l /* LABEL(l) builds a .L-prefixed label, which the assembler keeps local to this file */ | |
d9a4d2ab UD |
54 | |
55 | /* We use 0x1a: | |
56 | _SIDD_SBYTE_OPS | |
57 | | _SIDD_CMP_EQUAL_EACH | |
58 | | _SIDD_NEGATIVE_POLARITY | |
59 | | _SIDD_LEAST_SIGNIFICANT | |
60 | on pcmpistri to find out if two 16byte data elements are the same | |
61 | and the offset of the first different byte. There are 4 cases: | |
62 | ||
63 | 1. Both 16-byte data elements are valid and identical. | |
64 | 2. Both 16-byte data elements contain EOS and are identical. | |
65 | 3. Both 16-byte data elements are valid and they differ at offset X. | |
66 | 4. At least one 16-byte data element has EOS at offset X. The two 16-byte | |
67 | data elements must differ at or before offset X. | |
68 | ||
69 | Here is the table of ECX, CFlag, ZFlag and SFlag for 4 cases: | |
70 | ||
71 | case ECX CFlag ZFlag SFlag | |
72 | 1 16 0 0 0 | |
73 | 2 16 0 1 1 | |
74 | 3 X 1 0 0 | |
75 | 4 0 <= X 1 0/1 0/1 | |
76 | ||
77 | We exit from the loop for cases 2, 3 and 4 with jbe which branches | |
78 | when either CFlag or ZFlag is 1. If CFlag == 0, we return 0 for | |
79 | case 2. */ | |
80 | ||
81 | /* Put all SSE 4.2 functions together. */ | |
82 | .section .text.SECTION,"ax",@progbits | |
83 | .align 16 | |
84 | .type STRCMP_SSE42, @function | |
ac49ecaf L |
85 | .globl STRCMP_SSE42 |
86 | .hidden STRCMP_SSE42 | |
d9a4d2ab UD |
87 | #ifdef USE_AS_STRCASECMP_L /* Entry stub: load the locale pointer, then fall into the shared body below. */ |
88 | ENTRY (GLABEL(__strcasecmp)) | |
89 | movq __libc_tsd_LOCALE@gottpoff(%rip),%rax /* %rax = thread-pointer-relative offset of __libc_tsd_LOCALE, read from the GOT */ | |
70bc83b9 | 90 | mov %fs:(%rax),%RDX_LP /* %RDX_LP = value of the TLS variable __libc_tsd_LOCALE; the body below reads the locale through %rdx */ |
d9a4d2ab UD |
91 | |
92 | // XXX These 5 bytes should be before the function | |
93 | /* 5-byte NOP (encoding of "nopl 0x0(%rax,%rax,1)"), used as padding. */ | |
94 | .byte 0x0f,0x1f,0x44,0x00,0x00 | |
95 | END (GLABEL(__strcasecmp)) | |
96 | /* FALLTHROUGH to strcasecmp_l: there is no jump or ret here, so execution continues into the code that follows. */ | |
97 | #endif | |
98 | #ifdef USE_AS_STRNCASECMP_L /* Entry stub: load the locale pointer, then fall into the shared body below. */ | |
99 | ENTRY (GLABEL(__strncasecmp)) | |
100 | movq __libc_tsd_LOCALE@gottpoff(%rip),%rax /* %rax = thread-pointer-relative offset of __libc_tsd_LOCALE, read from the GOT */ | |
70bc83b9 | 101 | mov %fs:(%rax),%RCX_LP /* %RCX_LP = value of the TLS variable __libc_tsd_LOCALE; the body below reads the locale through %rcx */ |
d9a4d2ab UD |
102 | |
103 | // XXX These 5 bytes should be before the function | |
104 | /* 5-byte NOP (encoding of "nopl 0x0(%rax,%rax,1)"), used as padding. */ | |
105 | .byte 0x0f,0x1f,0x44,0x00,0x00 | |
106 | END (GLABEL(__strncasecmp)) | |
107 | /* FALLTHROUGH to strncasecmp_l: there is no jump or ret here, so execution continues into the code that follows. */ | |
108 | #endif | |
109 | ||
618280a1 UD |
110 | |
111 | #ifdef USE_AVX /* AVX build: rewrite each legacy SSE mnemonic used in this file to its v-prefixed form. */ | |
112 | # define movdqa vmovdqa | |
113 | # define movdqu vmovdqu | |
114 | # define pmovmskb vpmovmskb | |
115 | # define pcmpistri vpcmpistri | |
116 | # define psubb vpsubb | |
117 | # define pcmpeqb vpcmpeqb | |
118 | # define psrldq vpsrldq | |
119 | # define pslldq vpslldq | |
120 | # define palignr vpalignr | |
121 | # define pxor vpxor | |
122 | # define D(arg) arg, arg /* D(reg) repeats the destination operand, turning "op src, dst" into the three-operand "op src, dst, dst" */ | |
123 | #else /* SSE build: mnemonics are left alone and the destination is written once. */ | |
124 | # define D(arg) arg | |
125 | #endif | |
126 | ||
d9a4d2ab UD |
127 | STRCMP_SSE42: |
128 | cfi_startproc | |
5efc6777 | 129 | _CET_ENDBR |
d9a4d2ab UD |
130 | CALL_MCOUNT |
131 | ||
132 | /* | |
133 | * This implementation uses SSE to compare up to 16 bytes at a time. | |
134 | */ | |
135 | #ifdef USE_AS_STRCASECMP_L | |
136 | /* We have to fall back on the C implementation for locales | |
137 | with encodings not matching ASCII for single bytes. */ | |
138 | # if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 | |
70bc83b9 | 139 | mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rdx), %RAX_LP |
d9a4d2ab | 140 | # else |
70bc83b9 | 141 | mov (%rdx), %RAX_LP |
d9a4d2ab | 142 | # endif |
34372fc6 | 143 | testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax) |
d9a4d2ab UD |
144 | jne __strcasecmp_l_nonascii |
145 | #endif | |
146 | #ifdef USE_AS_STRNCASECMP_L | |
147 | /* We have to fall back on the C implementation for locales | |
148 | with encodings not matching ASCII for single bytes. */ | |
149 | # if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 | |
70bc83b9 | 150 | mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rcx), %RAX_LP |
d9a4d2ab | 151 | # else |
70bc83b9 | 152 | mov (%rcx), %RAX_LP |
d9a4d2ab | 153 | # endif |
34372fc6 | 154 | testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax) |
d9a4d2ab UD |
155 | jne __strncasecmp_l_nonascii |
156 | #endif | |
157 | ||
158 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
ee915088 | 159 | test %RDX_LP, %RDX_LP |
d9a4d2ab | 160 | je LABEL(strcmp_exitz) |
ee915088 | 161 | cmp $1, %RDX_LP |
d9a4d2ab | 162 | je LABEL(Byte0) |
ee915088 | 163 | mov %RDX_LP, %R11_LP |
d9a4d2ab UD |
164 | #endif |
165 | mov %esi, %ecx | |
166 | mov %edi, %eax | |
167 | /* Use 64bit AND here to avoid long NOP padding. */ | |
168 | and $0x3f, %rcx /* rsi alignment in cache line */ | |
169 | and $0x3f, %rax /* rdi alignment in cache line */ | |
170 | #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L | |
171 | .section .rodata.cst16,"aM",@progbits,16 | |
172 | .align 16 | |
173 | LABEL(belowupper): | |
174 | .quad 0x4040404040404040 | |
175 | .quad 0x4040404040404040 | |
176 | LABEL(topupper): | |
177 | # ifdef USE_AVX | |
178 | .quad 0x5a5a5a5a5a5a5a5a | |
179 | .quad 0x5a5a5a5a5a5a5a5a | |
180 | # else | |
181 | .quad 0x5b5b5b5b5b5b5b5b | |
182 | .quad 0x5b5b5b5b5b5b5b5b | |
183 | # endif | |
184 | LABEL(touppermask): | |
185 | .quad 0x2020202020202020 | |
186 | .quad 0x2020202020202020 | |
187 | .previous | |
188 | movdqa LABEL(belowupper)(%rip), %xmm4 | |
189 | # define UCLOW_reg %xmm4 | |
190 | movdqa LABEL(topupper)(%rip), %xmm5 | |
191 | # define UCHIGH_reg %xmm5 | |
192 | movdqa LABEL(touppermask)(%rip), %xmm6 | |
193 | # define LCQWORD_reg %xmm6 | |
194 | #endif | |
195 | cmp $0x30, %ecx | |
196 | ja LABEL(crosscache)/* rsi: 16-byte load will cross cache line */ | |
197 | cmp $0x30, %eax | |
198 | ja LABEL(crosscache)/* rdi: 16-byte load will cross cache line */ | |
199 | movdqu (%rdi), %xmm1 | |
200 | movdqu (%rsi), %xmm2 | |
201 | #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L | |
202 | # ifdef USE_AVX | |
203 | # define TOLOWER(reg1, reg2) \ | |
204 | vpcmpgtb UCLOW_reg, reg1, %xmm7; \ | |
205 | vpcmpgtb UCHIGH_reg, reg1, %xmm8; \ | |
206 | vpcmpgtb UCLOW_reg, reg2, %xmm9; \ | |
207 | vpcmpgtb UCHIGH_reg, reg2, %xmm10; \ | |
208 | vpandn %xmm7, %xmm8, %xmm8; \ | |
209 | vpandn %xmm9, %xmm10, %xmm10; \ | |
210 | vpand LCQWORD_reg, %xmm8, %xmm8; \ | |
211 | vpand LCQWORD_reg, %xmm10, %xmm10; \ | |
212 | vpor reg1, %xmm8, reg1; \ | |
213 | vpor reg2, %xmm10, reg2 | |
214 | # else | |
215 | # define TOLOWER(reg1, reg2) \ | |
216 | movdqa reg1, %xmm7; \ | |
217 | movdqa UCHIGH_reg, %xmm8; \ | |
218 | movdqa reg2, %xmm9; \ | |
219 | movdqa UCHIGH_reg, %xmm10; \ | |
220 | pcmpgtb UCLOW_reg, %xmm7; \ | |
221 | pcmpgtb reg1, %xmm8; \ | |
222 | pcmpgtb UCLOW_reg, %xmm9; \ | |
223 | pcmpgtb reg2, %xmm10; \ | |
224 | pand %xmm8, %xmm7; \ | |
225 | pand %xmm10, %xmm9; \ | |
226 | pand LCQWORD_reg, %xmm7; \ | |
227 | pand LCQWORD_reg, %xmm9; \ | |
228 | por %xmm7, reg1; \ | |
229 | por %xmm9, reg2 | |
230 | # endif | |
231 | TOLOWER (%xmm1, %xmm2) | |
232 | #else | |
233 | # define TOLOWER(reg1, reg2) | |
234 | #endif | |
618280a1 UD |
235 | pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char checks */ |
236 | pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */ | |
237 | pcmpeqb %xmm2, D(%xmm1) /* compare first 16 bytes for equality */ | |
238 | psubb %xmm0, D(%xmm1) /* packed sub of comparison results*/ | |
d9a4d2ab UD |
239 | pmovmskb %xmm1, %edx |
240 | sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */ | |
241 | jnz LABEL(less16bytes)/* If not, find different value or null char */ | |
242 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
243 | sub $16, %r11 | |
382466e0 | 244 | jbe LABEL(strcmp_exitz)/* finish comparison */ |
d9a4d2ab UD |
245 | #endif |
246 | add $16, %rsi /* prepare to search next 16 bytes */ | |
247 | add $16, %rdi /* prepare to search next 16 bytes */ | |
248 | ||
249 | /* | |
250 | * Determine source and destination string offsets from 16-byte | |
251 | * alignment. Use relative offset difference between the two to | |
252 | * determine which case below to use. | |
253 | */ | |
254 | .p2align 4 | |
255 | LABEL(crosscache): | |
256 | and $0xfffffffffffffff0, %rsi /* force %rsi is 16 byte aligned */ | |
257 | and $0xfffffffffffffff0, %rdi /* force %rdi is 16 byte aligned */ | |
258 | mov $0xffff, %edx /* for equivalent offset */ | |
259 | xor %r8d, %r8d | |
260 | and $0xf, %ecx /* offset of rsi */ | |
261 | and $0xf, %eax /* offset of rdi */ | |
618280a1 | 262 | pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char check */ |
d9a4d2ab UD |
263 | cmp %eax, %ecx |
264 | je LABEL(ashr_0) /* rsi and rdi relative offset same */ | |
265 | ja LABEL(bigger) | |
266 | mov %edx, %r8d /* r8d is offset flag for exit tail */ | |
267 | xchg %ecx, %eax | |
268 | xchg %rsi, %rdi | |
269 | LABEL(bigger): | |
618280a1 UD |
270 | movdqa (%rdi), %xmm2 |
271 | movdqa (%rsi), %xmm1 | |
d9a4d2ab UD |
272 | lea 15(%rax), %r9 |
273 | sub %rcx, %r9 | |
274 | lea LABEL(unaligned_table)(%rip), %r10 | |
275 | movslq (%r10, %r9,4), %r9 | |
618280a1 | 276 | pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */ |
d9a4d2ab | 277 | lea (%r10, %r9), %r10 |
8817df42 | 278 | _CET_NOTRACK jmp *%r10 /* jump to corresponding case */ |
d9a4d2ab UD |
279 | |
280 | /* | |
281 | * The following cases will be handled by ashr_0 | |
282 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case | |
283 | * n(0~15) n(0~15) 15(15+ n-n) ashr_0 | |
284 | */ | |
285 | .p2align 4 | |
286 | LABEL(ashr_0): | |
287 | ||
288 | movdqa (%rsi), %xmm1 | |
618280a1 | 289 | pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */ |
d9a4d2ab | 290 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
618280a1 | 291 | pcmpeqb (%rdi), D(%xmm1) /* compare 16 bytes for equality */ |
d9a4d2ab UD |
292 | #else |
293 | movdqa (%rdi), %xmm2 | |
294 | TOLOWER (%xmm1, %xmm2) | |
618280a1 | 295 | pcmpeqb %xmm2, D(%xmm1) /* compare 16 bytes for equality */ |
d9a4d2ab | 296 | #endif |
618280a1 | 297 | psubb %xmm0, D(%xmm1) /* packed sub of comparison results*/ |
d9a4d2ab UD |
298 | pmovmskb %xmm1, %r9d |
299 | shr %cl, %edx /* adjust 0xffff for offset */ | |
300 | shr %cl, %r9d /* adjust for 16-byte offset */ | |
301 | sub %r9d, %edx | |
302 | /* | |
303 | * edx equals r9d only if the remaining (16-rcx) bytes of one string match | |
304 | * the remaining (16-rax) bytes of the other and no null char was seen. | |
305 | */ | |
306 | jne LABEL(less32bytes) /* mismatch or null char */ | |
307 | UPDATE_STRNCMP_COUNTER | |
308 | mov $16, %rcx | |
309 | mov $16, %r9 | |
d9a4d2ab UD |
310 | |
311 | /* | |
312 | * Now both strings are aligned at 16-byte boundary. Loop over strings | |
313 | * checking 32-bytes per iteration. | |
314 | */ | |
315 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ | |
316 | .p2align 4 | |
317 | LABEL(ashr_0_use): | |
318 | movdqa (%rdi,%rdx), %xmm0 | |
319 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L | |
320 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
321 | #else | |
322 | movdqa (%rsi,%rdx), %xmm1 | |
323 | TOLOWER (%xmm0, %xmm1) | |
324 | pcmpistri $0x1a, %xmm1, %xmm0 | |
325 | #endif | |
326 | lea 16(%rdx), %rdx | |
327 | jbe LABEL(ashr_0_exit_use) | |
328 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
329 | sub $16, %r11 | |
330 | jbe LABEL(strcmp_exitz) | |
331 | #endif | |
332 | ||
333 | movdqa (%rdi,%rdx), %xmm0 | |
334 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L | |
335 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
336 | #else | |
337 | movdqa (%rsi,%rdx), %xmm1 | |
338 | TOLOWER (%xmm0, %xmm1) | |
339 | pcmpistri $0x1a, %xmm1, %xmm0 | |
340 | #endif | |
341 | lea 16(%rdx), %rdx | |
342 | jbe LABEL(ashr_0_exit_use) | |
343 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
344 | sub $16, %r11 | |
345 | jbe LABEL(strcmp_exitz) | |
346 | #endif | |
347 | jmp LABEL(ashr_0_use) | |
348 | ||
349 | ||
350 | .p2align 4 | |
351 | LABEL(ashr_0_exit_use): | |
352 | jnc LABEL(strcmp_exitz) | |
353 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
354 | sub %rcx, %r11 | |
355 | jbe LABEL(strcmp_exitz) | |
356 | #endif | |
357 | lea -16(%rdx, %rcx), %rcx | |
358 | movzbl (%rdi, %rcx), %eax | |
359 | movzbl (%rsi, %rcx), %edx | |
360 | #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L | |
361 | leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx | |
362 | movl (%rcx,%rax,4), %eax | |
363 | movl (%rcx,%rdx,4), %edx | |
364 | #endif | |
365 | sub %edx, %eax | |
366 | ret | |
367 | ||
368 | ||
369 | ||
370 | /* | |
371 | * The following cases will be handled by ashr_1 | |
372 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case | |
373 | * n(15) n -15 0(15 +(n-15) - n) ashr_1 | |
374 | */ | |
375 | .p2align 4 | |
376 | LABEL(ashr_1): | |
618280a1 | 377 | pslldq $15, D(%xmm2) /* shift first string to align with second */ |
d9a4d2ab | 378 | TOLOWER (%xmm1, %xmm2) |
618280a1 UD |
379 | pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */ |
380 | psubb %xmm0, D(%xmm2) /* packed sub of comparison results*/ | |
d9a4d2ab UD |
381 | pmovmskb %xmm2, %r9d |
382 | shr %cl, %edx /* adjust 0xffff for offset */ | |
383 | shr %cl, %r9d /* adjust for 16-byte offset */ | |
384 | sub %r9d, %edx | |
385 | jnz LABEL(less32bytes) /* mismatch or null char seen */ | |
386 | movdqa (%rdi), %xmm3 | |
387 | UPDATE_STRNCMP_COUNTER | |
388 | ||
d9a4d2ab UD |
389 | mov $16, %rcx /* index for loads*/ |
390 | mov $1, %r9d /* byte position left over from less32bytes case */ | |
391 | /* | |
392 | * Set up %r10 so that we can detect an upcoming page-boundary crossing. | |
393 | * Once %r10 goes positive, the next 16-byte load from %rdi would touch a | |
394 | * new 4K page, so we branch to the nibble code before issuing that load. | |
395 | */ | |
396 | lea 1(%rdi), %r10 | |
397 | and $0xfff, %r10 /* offset into 4K page */ | |
398 | sub $0x1000, %r10 /* subtract 4K pagesize */ | |
399 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ | |
400 | ||
401 | .p2align 4 | |
402 | LABEL(loop_ashr_1_use): | |
403 | add $16, %r10 | |
404 | jg LABEL(nibble_ashr_1_use) | |
405 | ||
406 | LABEL(nibble_ashr_1_restart_use): | |
407 | movdqa (%rdi, %rdx), %xmm0 | |
618280a1 | 408 | palignr $1, -16(%rdi, %rdx), D(%xmm0) |
d9a4d2ab UD |
409 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
410 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
411 | #else | |
412 | movdqa (%rsi,%rdx), %xmm1 | |
413 | TOLOWER (%xmm0, %xmm1) | |
414 | pcmpistri $0x1a, %xmm1, %xmm0 | |
415 | #endif | |
416 | jbe LABEL(exit_use) | |
417 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
418 | sub $16, %r11 | |
419 | jbe LABEL(strcmp_exitz) | |
420 | #endif | |
421 | ||
422 | add $16, %rdx | |
423 | add $16, %r10 | |
424 | jg LABEL(nibble_ashr_1_use) | |
425 | ||
426 | movdqa (%rdi, %rdx), %xmm0 | |
618280a1 | 427 | palignr $1, -16(%rdi, %rdx), D(%xmm0) |
d9a4d2ab UD |
428 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
429 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
430 | #else | |
431 | movdqa (%rsi,%rdx), %xmm1 | |
432 | TOLOWER (%xmm0, %xmm1) | |
433 | pcmpistri $0x1a, %xmm1, %xmm0 | |
434 | #endif | |
435 | jbe LABEL(exit_use) | |
436 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
437 | sub $16, %r11 | |
438 | jbe LABEL(strcmp_exitz) | |
439 | #endif | |
440 | add $16, %rdx | |
441 | jmp LABEL(loop_ashr_1_use) | |
442 | ||
443 | .p2align 4 | |
444 | LABEL(nibble_ashr_1_use): | |
445 | sub $0x1000, %r10 | |
446 | movdqa -16(%rdi, %rdx), %xmm0 | |
618280a1 | 447 | psrldq $1, D(%xmm0) |
d9a4d2ab UD |
448 | pcmpistri $0x3a,%xmm0, %xmm0 |
449 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
450 | cmp %r11, %rcx | |
451 | jae LABEL(nibble_ashr_exit_use) | |
452 | #endif | |
453 | cmp $14, %ecx | |
454 | ja LABEL(nibble_ashr_1_restart_use) | |
455 | ||
456 | jmp LABEL(nibble_ashr_exit_use) | |
457 | ||
458 | /* | |
459 | * The following cases will be handled by ashr_2 | |
460 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case | |
461 | * n(14~15) n -14 1(15 +(n-14) - n) ashr_2 | |
462 | */ | |
463 | .p2align 4 | |
464 | LABEL(ashr_2): | |
618280a1 | 465 | pslldq $14, D(%xmm2) |
d9a4d2ab | 466 | TOLOWER (%xmm1, %xmm2) |
618280a1 UD |
467 | pcmpeqb %xmm1, D(%xmm2) |
468 | psubb %xmm0, D(%xmm2) | |
d9a4d2ab UD |
469 | pmovmskb %xmm2, %r9d |
470 | shr %cl, %edx | |
471 | shr %cl, %r9d | |
472 | sub %r9d, %edx | |
473 | jnz LABEL(less32bytes) | |
474 | movdqa (%rdi), %xmm3 | |
475 | UPDATE_STRNCMP_COUNTER | |
476 | ||
d9a4d2ab UD |
477 | mov $16, %rcx /* index for loads */ |
478 | mov $2, %r9d /* byte position left over from less32bytes case */ | |
479 | /* | |
480 | * Set up %r10 so that we can detect an upcoming page-boundary crossing. | |
481 | * Once %r10 goes positive, the next 16-byte load from %rdi would touch a | |
482 | * new 4K page, so we branch to the nibble code before issuing that load. | |
483 | */ | |
484 | lea 2(%rdi), %r10 | |
485 | and $0xfff, %r10 /* offset into 4K page */ | |
486 | sub $0x1000, %r10 /* subtract 4K pagesize */ | |
487 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ | |
488 | ||
489 | .p2align 4 | |
490 | LABEL(loop_ashr_2_use): | |
491 | add $16, %r10 | |
492 | jg LABEL(nibble_ashr_2_use) | |
493 | ||
494 | LABEL(nibble_ashr_2_restart_use): | |
495 | movdqa (%rdi, %rdx), %xmm0 | |
618280a1 | 496 | palignr $2, -16(%rdi, %rdx), D(%xmm0) |
d9a4d2ab UD |
497 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
498 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
499 | #else | |
500 | movdqa (%rsi,%rdx), %xmm1 | |
501 | TOLOWER (%xmm0, %xmm1) | |
502 | pcmpistri $0x1a, %xmm1, %xmm0 | |
503 | #endif | |
504 | jbe LABEL(exit_use) | |
505 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
506 | sub $16, %r11 | |
507 | jbe LABEL(strcmp_exitz) | |
508 | #endif | |
509 | ||
510 | add $16, %rdx | |
511 | add $16, %r10 | |
512 | jg LABEL(nibble_ashr_2_use) | |
513 | ||
514 | movdqa (%rdi, %rdx), %xmm0 | |
618280a1 | 515 | palignr $2, -16(%rdi, %rdx), D(%xmm0) |
d9a4d2ab UD |
516 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
517 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
518 | #else | |
519 | movdqa (%rsi,%rdx), %xmm1 | |
520 | TOLOWER (%xmm0, %xmm1) | |
521 | pcmpistri $0x1a, %xmm1, %xmm0 | |
522 | #endif | |
523 | jbe LABEL(exit_use) | |
524 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
525 | sub $16, %r11 | |
526 | jbe LABEL(strcmp_exitz) | |
527 | #endif | |
528 | add $16, %rdx | |
529 | jmp LABEL(loop_ashr_2_use) | |
530 | ||
531 | .p2align 4 | |
532 | LABEL(nibble_ashr_2_use): | |
533 | sub $0x1000, %r10 | |
534 | movdqa -16(%rdi, %rdx), %xmm0 | |
618280a1 | 535 | psrldq $2, D(%xmm0) |
d9a4d2ab UD |
536 | pcmpistri $0x3a,%xmm0, %xmm0 |
537 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
538 | cmp %r11, %rcx | |
539 | jae LABEL(nibble_ashr_exit_use) | |
540 | #endif | |
541 | cmp $13, %ecx | |
542 | ja LABEL(nibble_ashr_2_restart_use) | |
543 | ||
544 | jmp LABEL(nibble_ashr_exit_use) | |
545 | ||
546 | /* | |
547 | * The following cases will be handled by ashr_3 | |
548 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case | |
549 | * n(13~15) n -13 2(15 +(n-13) - n) ashr_3 | |
550 | */ | |
551 | .p2align 4 | |
552 | LABEL(ashr_3): | |
618280a1 | 553 | pslldq $13, D(%xmm2) |
d9a4d2ab | 554 | TOLOWER (%xmm1, %xmm2) |
618280a1 UD |
555 | pcmpeqb %xmm1, D(%xmm2) |
556 | psubb %xmm0, D(%xmm2) | |
d9a4d2ab UD |
557 | pmovmskb %xmm2, %r9d |
558 | shr %cl, %edx | |
559 | shr %cl, %r9d | |
560 | sub %r9d, %edx | |
561 | jnz LABEL(less32bytes) | |
562 | movdqa (%rdi), %xmm3 | |
563 | ||
564 | UPDATE_STRNCMP_COUNTER | |
565 | ||
d9a4d2ab UD |
566 | mov $16, %rcx /* index for loads */ |
567 | mov $3, %r9d /* byte position left over from less32bytes case */ | |
568 | /* | |
569 | * Set up %r10 so that we can detect an upcoming page-boundary crossing. | |
570 | * Once %r10 goes positive, the next 16-byte load from %rdi would touch a | |
571 | * new 4K page, so we branch to the nibble code before issuing that load. | |
572 | */ | |
573 | lea 3(%rdi), %r10 | |
574 | and $0xfff, %r10 /* offset into 4K page */ | |
575 | sub $0x1000, %r10 /* subtract 4K pagesize */ | |
576 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ | |
577 | ||
578 | LABEL(loop_ashr_3_use): | |
579 | add $16, %r10 | |
580 | jg LABEL(nibble_ashr_3_use) | |
581 | ||
582 | LABEL(nibble_ashr_3_restart_use): | |
583 | movdqa (%rdi, %rdx), %xmm0 | |
618280a1 | 584 | palignr $3, -16(%rdi, %rdx), D(%xmm0) |
d9a4d2ab UD |
585 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
586 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
587 | #else | |
588 | movdqa (%rsi,%rdx), %xmm1 | |
589 | TOLOWER (%xmm0, %xmm1) | |
590 | pcmpistri $0x1a, %xmm1, %xmm0 | |
591 | #endif | |
592 | jbe LABEL(exit_use) | |
593 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
594 | sub $16, %r11 | |
595 | jbe LABEL(strcmp_exitz) | |
596 | #endif | |
597 | ||
598 | add $16, %rdx | |
599 | add $16, %r10 | |
600 | jg LABEL(nibble_ashr_3_use) | |
601 | ||
602 | movdqa (%rdi, %rdx), %xmm0 | |
618280a1 | 603 | palignr $3, -16(%rdi, %rdx), D(%xmm0) |
d9a4d2ab UD |
604 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
605 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
606 | #else | |
607 | movdqa (%rsi,%rdx), %xmm1 | |
608 | TOLOWER (%xmm0, %xmm1) | |
609 | pcmpistri $0x1a, %xmm1, %xmm0 | |
610 | #endif | |
611 | jbe LABEL(exit_use) | |
612 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
613 | sub $16, %r11 | |
614 | jbe LABEL(strcmp_exitz) | |
615 | #endif | |
616 | add $16, %rdx | |
617 | jmp LABEL(loop_ashr_3_use) | |
618 | ||
619 | .p2align 4 | |
620 | LABEL(nibble_ashr_3_use): | |
621 | sub $0x1000, %r10 | |
622 | movdqa -16(%rdi, %rdx), %xmm0 | |
618280a1 | 623 | psrldq $3, D(%xmm0) |
d9a4d2ab UD |
624 | pcmpistri $0x3a,%xmm0, %xmm0 |
625 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
626 | cmp %r11, %rcx | |
627 | jae LABEL(nibble_ashr_exit_use) | |
628 | #endif | |
629 | cmp $12, %ecx | |
630 | ja LABEL(nibble_ashr_3_restart_use) | |
631 | ||
632 | jmp LABEL(nibble_ashr_exit_use) | |
633 | ||
634 | /* | |
635 | * The following cases will be handled by ashr_4 | |
636 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case | |
637 | * n(12~15) n -12 3(15 +(n-12) - n) ashr_4 | |
638 | */ | |
639 | .p2align 4 | |
640 | LABEL(ashr_4): | |
618280a1 | 641 | pslldq $12, D(%xmm2) |
d9a4d2ab | 642 | TOLOWER (%xmm1, %xmm2) |
618280a1 UD |
643 | pcmpeqb %xmm1, D(%xmm2) |
644 | psubb %xmm0, D(%xmm2) | |
d9a4d2ab UD |
645 | pmovmskb %xmm2, %r9d |
646 | shr %cl, %edx | |
647 | shr %cl, %r9d | |
648 | sub %r9d, %edx | |
649 | jnz LABEL(less32bytes) | |
650 | movdqa (%rdi), %xmm3 | |
651 | ||
652 | UPDATE_STRNCMP_COUNTER | |
653 | ||
d9a4d2ab UD |
654 | mov $16, %rcx /* index for loads */ |
655 | mov $4, %r9d /* byte position left over from less32bytes case */ | |
656 | /* | |
657 | * Set up %r10 so that we can detect an upcoming page-boundary crossing. | |
658 | * Once %r10 goes positive, the next 16-byte load from %rdi would touch a | |
659 | * new 4K page, so we branch to the nibble code before issuing that load. | |
660 | */ | |
661 | lea 4(%rdi), %r10 | |
662 | and $0xfff, %r10 /* offset into 4K page */ | |
663 | sub $0x1000, %r10 /* subtract 4K pagesize */ | |
664 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ | |
665 | ||
666 | .p2align 4 | |
667 | LABEL(loop_ashr_4_use): | |
668 | add $16, %r10 | |
669 | jg LABEL(nibble_ashr_4_use) | |
670 | ||
671 | LABEL(nibble_ashr_4_restart_use): | |
672 | movdqa (%rdi, %rdx), %xmm0 | |
618280a1 | 673 | palignr $4, -16(%rdi, %rdx), D(%xmm0) |
d9a4d2ab UD |
674 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
675 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
676 | #else | |
677 | movdqa (%rsi,%rdx), %xmm1 | |
678 | TOLOWER (%xmm0, %xmm1) | |
679 | pcmpistri $0x1a, %xmm1, %xmm0 | |
680 | #endif | |
681 | jbe LABEL(exit_use) | |
682 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
683 | sub $16, %r11 | |
684 | jbe LABEL(strcmp_exitz) | |
685 | #endif | |
686 | ||
687 | add $16, %rdx | |
688 | add $16, %r10 | |
689 | jg LABEL(nibble_ashr_4_use) | |
690 | ||
691 | movdqa (%rdi, %rdx), %xmm0 | |
618280a1 | 692 | palignr $4, -16(%rdi, %rdx), D(%xmm0) |
d9a4d2ab UD |
693 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
694 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
695 | #else | |
696 | movdqa (%rsi,%rdx), %xmm1 | |
697 | TOLOWER (%xmm0, %xmm1) | |
698 | pcmpistri $0x1a, %xmm1, %xmm0 | |
699 | #endif | |
700 | jbe LABEL(exit_use) | |
701 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
702 | sub $16, %r11 | |
703 | jbe LABEL(strcmp_exitz) | |
704 | #endif | |
705 | add $16, %rdx | |
706 | jmp LABEL(loop_ashr_4_use) | |
707 | ||
708 | .p2align 4 | |
709 | LABEL(nibble_ashr_4_use): | |
710 | sub $0x1000, %r10 | |
711 | movdqa -16(%rdi, %rdx), %xmm0 | |
618280a1 | 712 | psrldq $4, D(%xmm0) |
d9a4d2ab UD |
713 | pcmpistri $0x3a,%xmm0, %xmm0 |
714 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
715 | cmp %r11, %rcx | |
716 | jae LABEL(nibble_ashr_exit_use) | |
717 | #endif | |
718 | cmp $11, %ecx | |
719 | ja LABEL(nibble_ashr_4_restart_use) | |
720 | ||
721 | jmp LABEL(nibble_ashr_exit_use) | |
722 | ||
723 | /* | |
724 | * The following cases will be handled by ashr_5 | |
725 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case | |
726 | * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5 | |
727 | */ | |
728 | .p2align 4 | |
729 | LABEL(ashr_5): | |
618280a1 | 730 | pslldq $11, D(%xmm2) |
d9a4d2ab | 731 | TOLOWER (%xmm1, %xmm2) |
618280a1 UD |
732 | pcmpeqb %xmm1, D(%xmm2) |
733 | psubb %xmm0, D(%xmm2) | |
d9a4d2ab UD |
734 | pmovmskb %xmm2, %r9d |
735 | shr %cl, %edx | |
736 | shr %cl, %r9d | |
737 | sub %r9d, %edx | |
738 | jnz LABEL(less32bytes) | |
739 | movdqa (%rdi), %xmm3 | |
740 | ||
741 | UPDATE_STRNCMP_COUNTER | |
742 | ||
d9a4d2ab UD |
743 | mov $16, %rcx /* index for loads */ |
744 | mov $5, %r9d /* byte position left over from less32bytes case */ | |
745 | /* | |
746 | * Set up %r10 so that we can detect an upcoming page-boundary crossing. | |
747 | * Once %r10 goes positive, the next 16-byte load from %rdi would touch a | |
748 | * new 4K page, so we branch to the nibble code before issuing that load. | |
749 | */ | |
750 | lea 5(%rdi), %r10 | |
751 | and $0xfff, %r10 /* offset into 4K page */ | |
752 | sub $0x1000, %r10 /* subtract 4K pagesize */ | |
753 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ | |
754 | ||
755 | .p2align 4 | |
756 | LABEL(loop_ashr_5_use): | |
757 | add $16, %r10 | |
758 | jg LABEL(nibble_ashr_5_use) | |
759 | ||
760 | LABEL(nibble_ashr_5_restart_use): | |
761 | movdqa (%rdi, %rdx), %xmm0 | |
618280a1 | 762 | palignr $5, -16(%rdi, %rdx), D(%xmm0) |
d9a4d2ab UD |
763 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
764 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
765 | #else | |
766 | movdqa (%rsi,%rdx), %xmm1 | |
767 | TOLOWER (%xmm0, %xmm1) | |
768 | pcmpistri $0x1a, %xmm1, %xmm0 | |
769 | #endif | |
770 | jbe LABEL(exit_use) | |
771 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
772 | sub $16, %r11 | |
773 | jbe LABEL(strcmp_exitz) | |
774 | #endif | |
775 | ||
776 | add $16, %rdx | |
777 | add $16, %r10 | |
778 | jg LABEL(nibble_ashr_5_use) | |
779 | ||
780 | movdqa (%rdi, %rdx), %xmm0 | |
781 | ||
618280a1 | 782 | palignr $5, -16(%rdi, %rdx), D(%xmm0) |
d9a4d2ab UD |
783 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
784 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
785 | #else | |
786 | movdqa (%rsi,%rdx), %xmm1 | |
787 | TOLOWER (%xmm0, %xmm1) | |
788 | pcmpistri $0x1a, %xmm1, %xmm0 | |
789 | #endif | |
790 | jbe LABEL(exit_use) | |
791 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
792 | sub $16, %r11 | |
793 | jbe LABEL(strcmp_exitz) | |
794 | #endif | |
795 | add $16, %rdx | |
796 | jmp LABEL(loop_ashr_5_use) | |
797 | ||
798 | .p2align 4 | |
799 | LABEL(nibble_ashr_5_use): | |
800 | sub $0x1000, %r10 | |
801 | movdqa -16(%rdi, %rdx), %xmm0 | |
618280a1 | 802 | psrldq $5, D(%xmm0) |
d9a4d2ab UD |
803 | pcmpistri $0x3a,%xmm0, %xmm0 |
804 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
805 | cmp %r11, %rcx | |
806 | jae LABEL(nibble_ashr_exit_use) | |
807 | #endif | |
808 | cmp $10, %ecx | |
809 | ja LABEL(nibble_ashr_5_restart_use) | |
810 | ||
811 | jmp LABEL(nibble_ashr_exit_use) | |
812 | ||
813 | /* | |
814 | * The following cases will be handled by ashr_6 | |
815 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case | |
816 | * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6 | |
817 | */ | |
818 | .p2align 4 | |
819 | LABEL(ashr_6): | |
618280a1 | 820 | pslldq $10, D(%xmm2) |
d9a4d2ab | 821 | TOLOWER (%xmm1, %xmm2) |
618280a1 UD |
822 | pcmpeqb %xmm1, D(%xmm2) |
823 | psubb %xmm0, D(%xmm2) | |
d9a4d2ab UD |
824 | pmovmskb %xmm2, %r9d |
825 | shr %cl, %edx | |
826 | shr %cl, %r9d | |
827 | sub %r9d, %edx | |
828 | jnz LABEL(less32bytes) | |
829 | movdqa (%rdi), %xmm3 | |
830 | ||
831 | UPDATE_STRNCMP_COUNTER | |
832 | ||
d9a4d2ab UD |
833 | mov $16, %rcx /* index for loads */ |
834 | mov $6, %r9d /* byte position left over from less32bytes case */ | |
835 | /* | |
836 | * Set up %r10 so that we can detect an upcoming page-boundary crossing. | |
837 | * Once %r10 goes positive, the next 16-byte load from %rdi would touch a | |
838 | * new 4K page, so we branch to the nibble code before issuing that load. | |
839 | */ | |
840 | lea 6(%rdi), %r10 | |
841 | and $0xfff, %r10 /* offset into 4K page */ | |
842 | sub $0x1000, %r10 /* subtract 4K pagesize */ | |
843 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ | |
844 | ||
845 | .p2align 4 | |
846 | LABEL(loop_ashr_6_use): | |
847 | add $16, %r10 | |
848 | jg LABEL(nibble_ashr_6_use) | |
849 | ||
850 | LABEL(nibble_ashr_6_restart_use): | |
851 | movdqa (%rdi, %rdx), %xmm0 | |
618280a1 | 852 | palignr $6, -16(%rdi, %rdx), D(%xmm0) |
d9a4d2ab UD |
853 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
854 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
855 | #else | |
856 | movdqa (%rsi,%rdx), %xmm1 | |
857 | TOLOWER (%xmm0, %xmm1) | |
858 | pcmpistri $0x1a, %xmm1, %xmm0 | |
859 | #endif | |
860 | jbe LABEL(exit_use) | |
861 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
862 | sub $16, %r11 | |
863 | jbe LABEL(strcmp_exitz) | |
864 | #endif | |
865 | ||
866 | add $16, %rdx | |
867 | add $16, %r10 | |
868 | jg LABEL(nibble_ashr_6_use) | |
869 | ||
870 | movdqa (%rdi, %rdx), %xmm0 | |
618280a1 | 871 | palignr $6, -16(%rdi, %rdx), D(%xmm0) |
d9a4d2ab UD |
872 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
873 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
874 | #else | |
875 | movdqa (%rsi,%rdx), %xmm1 | |
876 | TOLOWER (%xmm0, %xmm1) | |
877 | pcmpistri $0x1a, %xmm1, %xmm0 | |
878 | #endif | |
879 | jbe LABEL(exit_use) | |
880 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
881 | sub $16, %r11 | |
882 | jbe LABEL(strcmp_exitz) | |
883 | #endif | |
884 | add $16, %rdx | |
885 | jmp LABEL(loop_ashr_6_use) | |
886 | ||
887 | .p2align 4 | |
888 | LABEL(nibble_ashr_6_use): | |
889 | sub $0x1000, %r10 | |
890 | movdqa -16(%rdi, %rdx), %xmm0 | |
618280a1 | 891 | psrldq $6, D(%xmm0) |
d9a4d2ab UD |
892 | pcmpistri $0x3a,%xmm0, %xmm0 |
893 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
894 | cmp %r11, %rcx | |
895 | jae LABEL(nibble_ashr_exit_use) | |
896 | #endif | |
897 | cmp $9, %ecx | |
898 | ja LABEL(nibble_ashr_6_restart_use) | |
899 | ||
900 | jmp LABEL(nibble_ashr_exit_use) | |
901 | ||
902 | /* |
903 | * The following cases will be handled by ashr_7, whose loop reads the rdi data at a 7-byte shift (palignr $7) |
904 | * rcx (offset of rsi) = n (9~15), rax (offset of rdi) = n - 9, |
905 | * relative offset = 6 (15 + (n - 9) - n), corresponding case = ashr_7 |
906 | */ |
907 | .p2align 4 |
908 | LABEL(ashr_7): |
618280a1 | 909 | pslldq $9, D(%xmm2) /* shift xmm2 left by 16 - 7 = 9 bytes */ |
d9a4d2ab | 910 | TOLOWER (%xmm1, %xmm2) |
618280a1 UD |
911 | pcmpeqb %xmm1, D(%xmm2) /* xmm2 = byte-wise equality mask against xmm1 */ |
912 | psubb %xmm0, D(%xmm2) /* NOTE(review): xmm0 is set before this chunk, presumably the null-byte mask of xmm1 - confirm */ | |
d9a4d2ab UD |
913 | pmovmskb %xmm2, %r9d /* r9d = one bit per mask byte */ |
914 | shr %cl, %edx /* edx is also set before this chunk */ | |
915 | shr %cl, %r9d /* drop the low cl bits of both masks */ | |
916 | sub %r9d, %edx | |
917 | jnz LABEL(less32bytes) /* masks differ: finish in less32bytes */ | |
918 | movdqa (%rdi), %xmm3 /* NOTE(review): xmm3 is not read again within this chunk */ | |
919 | |
920 | UPDATE_STRNCMP_COUNTER /* strncmp variants only: r11 += rcx - 16, return 0 if that exceeds the old r11 or is 0 */ | |
921 | |
d9a4d2ab UD |
922 | mov $16, %rcx /* index for loads */ |
923 | mov $7, %r9d /* byte position left over from less32bytes case, exit_use rebases rdi by r9 - 16 */ | |
924 | /* |
925 | * Set up %r10 so that crossing a page boundary can be detected: it starts as |
926 | * ((rdi + 7) modulo 4K) - 4K and grows by 16 per block. When %r10 goes positive we have |
927 | * crossed a page boundary and need to do a nibble (check the leftover bytes first). |
928 | */ |
929 | lea 7(%rdi), %r10 | |
930 | and $0xfff, %r10 /* offset into 4K page */ | |
931 | sub $0x1000, %r10 /* subtract 4K pagesize */ | |
932 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ | |
933 | |
934 | .p2align 4 | |
935 | LABEL(loop_ashr_7_use): | |
936 | add $16, %r10 | |
937 | jg LABEL(nibble_ashr_7_use) /* r10 went positive: page boundary */ | |
938 | |
939 | LABEL(nibble_ashr_7_restart_use): | |
940 | movdqa (%rdi, %rdx), %xmm0 | |
618280a1 | 941 | palignr $7, -16(%rdi, %rdx), D(%xmm0) /* xmm0 = the 16 bytes starting 7 bytes into the previous block */ |
d9a4d2ab UD |
942 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
943 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
944 | #else | |
945 | movdqa (%rsi,%rdx), %xmm1 | |
946 | TOLOWER (%xmm0, %xmm1) | |
947 | pcmpistri $0x1a, %xmm1, %xmm0 | |
948 | #endif | |
949 | jbe LABEL(exit_use) /* CF = bytes differ (index in ecx), ZF = null byte in the rsi block */ | |
950 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
951 | sub $16, %r11 /* 16 fewer bytes left to compare */ | |
952 | jbe LABEL(strcmp_exitz) /* count used up: return 0 */ | |
953 | #endif | |
954 | |
955 | add $16, %rdx /* next block: second, unrolled copy of the compare above */ | |
956 | add $16, %r10 | |
957 | jg LABEL(nibble_ashr_7_use) | |
958 | |
959 | movdqa (%rdi, %rdx), %xmm0 | |
618280a1 | 960 | palignr $7, -16(%rdi, %rdx), D(%xmm0) |
d9a4d2ab UD |
961 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
962 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
963 | #else | |
964 | movdqa (%rsi,%rdx), %xmm1 | |
965 | TOLOWER (%xmm0, %xmm1) | |
966 | pcmpistri $0x1a, %xmm1, %xmm0 | |
967 | #endif | |
968 | jbe LABEL(exit_use) | |
969 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
970 | sub $16, %r11 | |
971 | jbe LABEL(strcmp_exitz) | |
972 | #endif | |
973 | add $16, %rdx | |
974 | jmp LABEL(loop_ashr_7_use) | |
975 | |
976 | .p2align 4 | |
977 | LABEL(nibble_ashr_7_use): | |
978 | sub $0x1000, %r10 /* restart the page countdown */ | |
979 | movdqa -16(%rdi, %rdx), %xmm0 /* reload the previous block */ | |
618280a1 | 980 | psrldq $7, D(%xmm0) /* keep its last 9 bytes, zero-filled above */ |
d9a4d2ab UD |
981 | pcmpistri $0x3a,%xmm0, %xmm0 /* ecx = index of the first null byte in xmm0 */ |
982 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
983 | cmp %r11, %rcx | |
984 | jae LABEL(nibble_ashr_exit_use) /* remaining count r11 is used up by these leftover bytes */ | |
985 | #endif | |
986 | cmp $8, %ecx | |
987 | ja LABEL(nibble_ashr_7_restart_use) /* no null byte in the 9 leftover bytes: go on into the next page */ | |
988 | |
989 | jmp LABEL(nibble_ashr_exit_use) /* string ends in the leftover bytes: compare them without loading the next rdi block */ | |
990 | ||
991 | /* |
992 | * The following cases will be handled by ashr_8, whose loop reads the rdi data at an 8-byte shift (palignr $8) |
993 | * rcx (offset of rsi) = n (8~15), rax (offset of rdi) = n - 8, |
994 | * relative offset = 7 (15 + (n - 8) - n), corresponding case = ashr_8 |
995 | */ |
996 | .p2align 4 |
997 | LABEL(ashr_8): |
618280a1 | 998 | pslldq $8, D(%xmm2) /* shift xmm2 left by 16 - 8 = 8 bytes */ |
d9a4d2ab | 999 | TOLOWER (%xmm1, %xmm2) |
618280a1 UD |
1000 | pcmpeqb %xmm1, D(%xmm2) /* xmm2 = byte-wise equality mask against xmm1 */ |
1001 | psubb %xmm0, D(%xmm2) /* NOTE(review): xmm0 is set before this chunk, presumably the null-byte mask of xmm1 - confirm */ | |
d9a4d2ab UD |
1002 | pmovmskb %xmm2, %r9d /* r9d = one bit per mask byte */ |
1003 | shr %cl, %edx /* edx is also set before this chunk */ | |
1004 | shr %cl, %r9d /* drop the low cl bits of both masks */ | |
1005 | sub %r9d, %edx | |
1006 | jnz LABEL(less32bytes) /* masks differ: finish in less32bytes */ | |
1007 | movdqa (%rdi), %xmm3 /* NOTE(review): xmm3 is not read again within this chunk */ | |
1008 | |
1009 | UPDATE_STRNCMP_COUNTER /* strncmp variants only: r11 += rcx - 16, return 0 if that exceeds the old r11 or is 0 */ | |
1010 | |
d9a4d2ab UD |
1011 | mov $16, %rcx /* index for loads */ |
1012 | mov $8, %r9d /* byte position left over from less32bytes case, exit_use rebases rdi by r9 - 16 */ | |
1013 | /* |
1014 | * Set up %r10 so that crossing a page boundary can be detected: it starts as |
1015 | * ((rdi + 8) modulo 4K) - 4K and grows by 16 per block. When %r10 goes positive we have |
1016 | * crossed a page boundary and need to do a nibble (check the leftover bytes first). |
1017 | */ |
1018 | lea 8(%rdi), %r10 | |
1019 | and $0xfff, %r10 /* offset into 4K page */ | |
1020 | sub $0x1000, %r10 /* subtract 4K pagesize */ | |
1021 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ | |
1022 | |
1023 | .p2align 4 | |
1024 | LABEL(loop_ashr_8_use): | |
1025 | add $16, %r10 | |
1026 | jg LABEL(nibble_ashr_8_use) /* r10 went positive: page boundary */ | |
1027 | |
1028 | LABEL(nibble_ashr_8_restart_use): | |
1029 | movdqa (%rdi, %rdx), %xmm0 | |
618280a1 | 1030 | palignr $8, -16(%rdi, %rdx), D(%xmm0) /* xmm0 = the 16 bytes starting 8 bytes into the previous block */ |
d9a4d2ab UD |
1031 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1032 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 | |
1033 | #else | |
1034 | movdqa (%rsi,%rdx), %xmm1 | |
1035 | TOLOWER (%xmm0, %xmm1) | |
1036 | pcmpistri $0x1a, %xmm1, %xmm0 | |
1037 | #endif | |
1038 | jbe LABEL(exit_use) /* CF = bytes differ (index in ecx), ZF = null byte in the rsi block */ | |
1039 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1040 | sub $16, %r11 /* 16 fewer bytes left to compare */ | |
1041 | jbe LABEL(strcmp_exitz) /* count used up: return 0 */ | |
1042 | #endif | |
1043 | |
1044 | add $16, %rdx /* next block: second, unrolled copy of the compare above */ | |
1045 | add $16, %r10 | |
1046 | jg LABEL(nibble_ashr_8_use) | |
1047 | |
1048 | movdqa (%rdi, %rdx), %xmm0 | |
618280a1 | 1049 | palignr $8, -16(%rdi, %rdx), D(%xmm0) |
d9a4d2ab UD |
1050 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1051 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 | |
1052 | #else | |
1053 | movdqa (%rsi,%rdx), %xmm1 | |
1054 | TOLOWER (%xmm0, %xmm1) | |
1055 | pcmpistri $0x1a, %xmm1, %xmm0 | |
1056 | #endif | |
1057 | jbe LABEL(exit_use) | |
1058 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1059 | sub $16, %r11 | |
1060 | jbe LABEL(strcmp_exitz) | |
1061 | #endif | |
1062 | add $16, %rdx | |
1063 | jmp LABEL(loop_ashr_8_use) | |
1064 | |
1065 | .p2align 4 | |
1066 | LABEL(nibble_ashr_8_use): | |
1067 | sub $0x1000, %r10 /* restart the page countdown */ | |
1068 | movdqa -16(%rdi, %rdx), %xmm0 /* reload the previous block */ | |
618280a1 | 1069 | psrldq $8, D(%xmm0) /* keep its last 8 bytes, zero-filled above */ |
d9a4d2ab UD |
1070 | pcmpistri $0x3a,%xmm0, %xmm0 /* ecx = index of the first null byte in xmm0 */ |
1071 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1072 | cmp %r11, %rcx | |
1073 | jae LABEL(nibble_ashr_exit_use) /* remaining count r11 is used up by these leftover bytes */ | |
1074 | #endif | |
1075 | cmp $7, %ecx | |
1076 | ja LABEL(nibble_ashr_8_restart_use) /* no null byte in the 8 leftover bytes: go on into the next page */ | |
1077 | |
1078 | jmp LABEL(nibble_ashr_exit_use) /* string ends in the leftover bytes: compare them without loading the next rdi block */ | |
1079 | ||
1080 | /* |
1081 | * The following cases will be handled by ashr_9, whose loop reads the rdi data at a 9-byte shift (palignr $9) |
1082 | * rcx (offset of rsi) = n (7~15), rax (offset of rdi) = n - 7, |
1083 | * relative offset = 8 (15 + (n - 7) - n), corresponding case = ashr_9 |
1084 | */ |
1085 | .p2align 4 |
1086 | LABEL(ashr_9): |
618280a1 | 1087 | pslldq $7, D(%xmm2) /* shift xmm2 left by 16 - 9 = 7 bytes */ |
d9a4d2ab | 1088 | TOLOWER (%xmm1, %xmm2) |
618280a1 UD |
1089 | pcmpeqb %xmm1, D(%xmm2) /* xmm2 = byte-wise equality mask against xmm1 */ |
1090 | psubb %xmm0, D(%xmm2) /* NOTE(review): xmm0 is set before this chunk, presumably the null-byte mask of xmm1 - confirm */ | |
d9a4d2ab UD |
1091 | pmovmskb %xmm2, %r9d /* r9d = one bit per mask byte */ |
1092 | shr %cl, %edx /* edx is also set before this chunk */ | |
1093 | shr %cl, %r9d /* drop the low cl bits of both masks */ | |
1094 | sub %r9d, %edx | |
1095 | jnz LABEL(less32bytes) /* masks differ: finish in less32bytes */ | |
1096 | movdqa (%rdi), %xmm3 /* NOTE(review): xmm3 is not read again within this chunk */ | |
1097 | |
1098 | UPDATE_STRNCMP_COUNTER /* strncmp variants only: r11 += rcx - 16, return 0 if that exceeds the old r11 or is 0 */ | |
1099 | |
d9a4d2ab UD |
1100 | mov $16, %rcx /* index for loads */ |
1101 | mov $9, %r9d /* byte position left over from less32bytes case, exit_use rebases rdi by r9 - 16 */ | |
1102 | /* |
1103 | * Set up %r10 so that crossing a page boundary can be detected: it starts as |
1104 | * ((rdi + 9) modulo 4K) - 4K and grows by 16 per block. When %r10 goes positive we have |
1105 | * crossed a page boundary and need to do a nibble (check the leftover bytes first). |
1106 | */ |
1107 | lea 9(%rdi), %r10 | |
1108 | and $0xfff, %r10 /* offset into 4K page */ | |
1109 | sub $0x1000, %r10 /* subtract 4K pagesize */ | |
1110 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ | |
1111 | |
1112 | .p2align 4 | |
1113 | LABEL(loop_ashr_9_use): | |
1114 | add $16, %r10 | |
1115 | jg LABEL(nibble_ashr_9_use) /* r10 went positive: page boundary */ | |
1116 | |
1117 | LABEL(nibble_ashr_9_restart_use): | |
1118 | movdqa (%rdi, %rdx), %xmm0 | |
1119 | |
618280a1 | 1120 | palignr $9, -16(%rdi, %rdx), D(%xmm0) /* xmm0 = the 16 bytes starting 9 bytes into the previous block */ |
d9a4d2ab UD |
1121 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1122 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 | |
1123 | #else | |
1124 | movdqa (%rsi,%rdx), %xmm1 | |
1125 | TOLOWER (%xmm0, %xmm1) | |
1126 | pcmpistri $0x1a, %xmm1, %xmm0 | |
1127 | #endif | |
1128 | jbe LABEL(exit_use) /* CF = bytes differ (index in ecx), ZF = null byte in the rsi block */ | |
1129 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1130 | sub $16, %r11 /* 16 fewer bytes left to compare */ | |
1131 | jbe LABEL(strcmp_exitz) /* count used up: return 0 */ | |
1132 | #endif | |
1133 | |
1134 | add $16, %rdx /* next block: second, unrolled copy of the compare above */ | |
1135 | add $16, %r10 | |
1136 | jg LABEL(nibble_ashr_9_use) | |
1137 | |
1138 | movdqa (%rdi, %rdx), %xmm0 | |
618280a1 | 1139 | palignr $9, -16(%rdi, %rdx), D(%xmm0) |
d9a4d2ab UD |
1140 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1141 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 | |
1142 | #else | |
1143 | movdqa (%rsi,%rdx), %xmm1 | |
1144 | TOLOWER (%xmm0, %xmm1) | |
1145 | pcmpistri $0x1a, %xmm1, %xmm0 | |
1146 | #endif | |
1147 | jbe LABEL(exit_use) | |
1148 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1149 | sub $16, %r11 | |
1150 | jbe LABEL(strcmp_exitz) | |
1151 | #endif | |
1152 | add $16, %rdx | |
1153 | jmp LABEL(loop_ashr_9_use) | |
1154 | |
1155 | .p2align 4 | |
1156 | LABEL(nibble_ashr_9_use): | |
1157 | sub $0x1000, %r10 /* restart the page countdown */ | |
1158 | movdqa -16(%rdi, %rdx), %xmm0 /* reload the previous block */ | |
618280a1 | 1159 | psrldq $9, D(%xmm0) /* keep its last 7 bytes, zero-filled above */ |
d9a4d2ab UD |
1160 | pcmpistri $0x3a,%xmm0, %xmm0 /* ecx = index of the first null byte in xmm0 */ |
1161 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1162 | cmp %r11, %rcx | |
1163 | jae LABEL(nibble_ashr_exit_use) /* remaining count r11 is used up by these leftover bytes */ | |
1164 | #endif | |
1165 | cmp $6, %ecx | |
1166 | ja LABEL(nibble_ashr_9_restart_use) /* no null byte in the 7 leftover bytes: go on into the next page */ | |
1167 | |
1168 | jmp LABEL(nibble_ashr_exit_use) /* string ends in the leftover bytes: compare them without loading the next rdi block */ | |
1169 | ||
1170 | /* |
1171 | * The following cases will be handled by ashr_10, whose loop reads the rdi data at a 10-byte shift (palignr $10) |
1172 | * rcx (offset of rsi) = n (6~15), rax (offset of rdi) = n - 6, |
1173 | * relative offset = 9 (15 + (n - 6) - n), corresponding case = ashr_10 |
1174 | */ |
1175 | .p2align 4 |
1176 | LABEL(ashr_10): |
618280a1 | 1177 | pslldq $6, D(%xmm2) /* shift xmm2 left by 16 - 10 = 6 bytes */ |
d9a4d2ab | 1178 | TOLOWER (%xmm1, %xmm2) |
618280a1 UD |
1179 | pcmpeqb %xmm1, D(%xmm2) /* xmm2 = byte-wise equality mask against xmm1 */ |
1180 | psubb %xmm0, D(%xmm2) /* NOTE(review): xmm0 is set before this chunk, presumably the null-byte mask of xmm1 - confirm */ | |
d9a4d2ab UD |
1181 | pmovmskb %xmm2, %r9d /* r9d = one bit per mask byte */ |
1182 | shr %cl, %edx /* edx is also set before this chunk */ | |
1183 | shr %cl, %r9d /* drop the low cl bits of both masks */ | |
1184 | sub %r9d, %edx | |
1185 | jnz LABEL(less32bytes) /* masks differ: finish in less32bytes */ | |
1186 | movdqa (%rdi), %xmm3 /* NOTE(review): xmm3 is not read again within this chunk */ | |
1187 | |
1188 | UPDATE_STRNCMP_COUNTER /* strncmp variants only: r11 += rcx - 16, return 0 if that exceeds the old r11 or is 0 */ | |
1189 | |
d9a4d2ab UD |
1190 | mov $16, %rcx /* index for loads */ |
1191 | mov $10, %r9d /* byte position left over from less32bytes case, exit_use rebases rdi by r9 - 16 */ | |
1192 | /* |
1193 | * Set up %r10 so that crossing a page boundary can be detected: it starts as |
1194 | * ((rdi + 10) modulo 4K) - 4K and grows by 16 per block. When %r10 goes positive we have |
1195 | * crossed a page boundary and need to do a nibble (check the leftover bytes first). |
1196 | */ |
1197 | lea 10(%rdi), %r10 | |
1198 | and $0xfff, %r10 /* offset into 4K page */ | |
1199 | sub $0x1000, %r10 /* subtract 4K pagesize */ | |
1200 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ | |
1201 | |
1202 | .p2align 4 | |
1203 | LABEL(loop_ashr_10_use): | |
1204 | add $16, %r10 | |
1205 | jg LABEL(nibble_ashr_10_use) /* r10 went positive: page boundary */ | |
1206 | |
1207 | LABEL(nibble_ashr_10_restart_use): | |
1208 | movdqa (%rdi, %rdx), %xmm0 | |
618280a1 | 1209 | palignr $10, -16(%rdi, %rdx), D(%xmm0) /* xmm0 = the 16 bytes starting 10 bytes into the previous block */ |
d9a4d2ab UD |
1210 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1211 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 | |
1212 | #else | |
1213 | movdqa (%rsi,%rdx), %xmm1 | |
1214 | TOLOWER (%xmm0, %xmm1) | |
1215 | pcmpistri $0x1a, %xmm1, %xmm0 | |
1216 | #endif | |
1217 | jbe LABEL(exit_use) /* CF = bytes differ (index in ecx), ZF = null byte in the rsi block */ | |
1218 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1219 | sub $16, %r11 /* 16 fewer bytes left to compare */ | |
1220 | jbe LABEL(strcmp_exitz) /* count used up: return 0 */ | |
1221 | #endif | |
1222 | |
1223 | add $16, %rdx /* next block: second, unrolled copy of the compare above */ | |
1224 | add $16, %r10 | |
1225 | jg LABEL(nibble_ashr_10_use) | |
1226 | |
1227 | movdqa (%rdi, %rdx), %xmm0 | |
618280a1 | 1228 | palignr $10, -16(%rdi, %rdx), D(%xmm0) |
d9a4d2ab UD |
1229 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1230 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 | |
1231 | #else | |
1232 | movdqa (%rsi,%rdx), %xmm1 | |
1233 | TOLOWER (%xmm0, %xmm1) | |
1234 | pcmpistri $0x1a, %xmm1, %xmm0 | |
1235 | #endif | |
1236 | jbe LABEL(exit_use) | |
1237 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1238 | sub $16, %r11 | |
1239 | jbe LABEL(strcmp_exitz) | |
1240 | #endif | |
1241 | add $16, %rdx | |
1242 | jmp LABEL(loop_ashr_10_use) | |
1243 | |
1244 | .p2align 4 | |
1245 | LABEL(nibble_ashr_10_use): | |
1246 | sub $0x1000, %r10 /* restart the page countdown */ | |
1247 | movdqa -16(%rdi, %rdx), %xmm0 /* reload the previous block */ | |
618280a1 | 1248 | psrldq $10, D(%xmm0) /* keep its last 6 bytes, zero-filled above */ |
d9a4d2ab UD |
1249 | pcmpistri $0x3a,%xmm0, %xmm0 /* ecx = index of the first null byte in xmm0 */ |
1250 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1251 | cmp %r11, %rcx | |
1252 | jae LABEL(nibble_ashr_exit_use) /* remaining count r11 is used up by these leftover bytes */ | |
1253 | #endif | |
1254 | cmp $5, %ecx | |
1255 | ja LABEL(nibble_ashr_10_restart_use) /* no null byte in the 6 leftover bytes: go on into the next page */ | |
1256 | |
1257 | jmp LABEL(nibble_ashr_exit_use) /* string ends in the leftover bytes: compare them without loading the next rdi block */ | |
1258 | ||
1259 | /* |
1260 | * The following cases will be handled by ashr_11, whose loop reads the rdi data at an 11-byte shift (palignr $11) |
1261 | * rcx (offset of rsi) = n (5~15), rax (offset of rdi) = n - 5, |
1262 | * relative offset = 10 (15 + (n - 5) - n), corresponding case = ashr_11 |
1263 | */ |
1264 | .p2align 4 |
1265 | LABEL(ashr_11): |
618280a1 | 1266 | pslldq $5, D(%xmm2) /* shift xmm2 left by 16 - 11 = 5 bytes */ |
d9a4d2ab | 1267 | TOLOWER (%xmm1, %xmm2) |
618280a1 UD |
1268 | pcmpeqb %xmm1, D(%xmm2) /* xmm2 = byte-wise equality mask against xmm1 */ |
1269 | psubb %xmm0, D(%xmm2) /* NOTE(review): xmm0 is set before this chunk, presumably the null-byte mask of xmm1 - confirm */ | |
d9a4d2ab UD |
1270 | pmovmskb %xmm2, %r9d /* r9d = one bit per mask byte */ |
1271 | shr %cl, %edx /* edx is also set before this chunk */ | |
1272 | shr %cl, %r9d /* drop the low cl bits of both masks */ | |
1273 | sub %r9d, %edx | |
1274 | jnz LABEL(less32bytes) /* masks differ: finish in less32bytes */ | |
1275 | movdqa (%rdi), %xmm3 /* NOTE(review): xmm3 is not read again within this chunk */ | |
1276 | |
1277 | UPDATE_STRNCMP_COUNTER /* strncmp variants only: r11 += rcx - 16, return 0 if that exceeds the old r11 or is 0 */ | |
1278 | |
d9a4d2ab UD |
1279 | mov $16, %rcx /* index for loads */ |
1280 | mov $11, %r9d /* byte position left over from less32bytes case, exit_use rebases rdi by r9 - 16 */ | |
1281 | /* |
1282 | * Set up %r10 so that crossing a page boundary can be detected: it starts as |
1283 | * ((rdi + 11) modulo 4K) - 4K and grows by 16 per block. When %r10 goes positive we have |
1284 | * crossed a page boundary and need to do a nibble (check the leftover bytes first). |
1285 | */ |
1286 | lea 11(%rdi), %r10 | |
1287 | and $0xfff, %r10 /* offset into 4K page */ | |
1288 | sub $0x1000, %r10 /* subtract 4K pagesize */ | |
1289 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ | |
1290 | |
1291 | .p2align 4 | |
1292 | LABEL(loop_ashr_11_use): | |
1293 | add $16, %r10 | |
1294 | jg LABEL(nibble_ashr_11_use) /* r10 went positive: page boundary */ | |
1295 | |
1296 | LABEL(nibble_ashr_11_restart_use): | |
1297 | movdqa (%rdi, %rdx), %xmm0 | |
618280a1 | 1298 | palignr $11, -16(%rdi, %rdx), D(%xmm0) /* xmm0 = the 16 bytes starting 11 bytes into the previous block */ |
d9a4d2ab UD |
1299 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1300 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 | |
1301 | #else | |
1302 | movdqa (%rsi,%rdx), %xmm1 | |
1303 | TOLOWER (%xmm0, %xmm1) | |
1304 | pcmpistri $0x1a, %xmm1, %xmm0 | |
1305 | #endif | |
1306 | jbe LABEL(exit_use) /* CF = bytes differ (index in ecx), ZF = null byte in the rsi block */ | |
1307 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1308 | sub $16, %r11 /* 16 fewer bytes left to compare */ | |
1309 | jbe LABEL(strcmp_exitz) /* count used up: return 0 */ | |
1310 | #endif | |
1311 | |
1312 | add $16, %rdx /* next block: second, unrolled copy of the compare above */ | |
1313 | add $16, %r10 | |
1314 | jg LABEL(nibble_ashr_11_use) | |
1315 | |
1316 | movdqa (%rdi, %rdx), %xmm0 | |
618280a1 | 1317 | palignr $11, -16(%rdi, %rdx), D(%xmm0) |
d9a4d2ab UD |
1318 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1319 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 | |
1320 | #else | |
1321 | movdqa (%rsi,%rdx), %xmm1 | |
1322 | TOLOWER (%xmm0, %xmm1) | |
1323 | pcmpistri $0x1a, %xmm1, %xmm0 | |
1324 | #endif | |
1325 | jbe LABEL(exit_use) | |
1326 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1327 | sub $16, %r11 | |
1328 | jbe LABEL(strcmp_exitz) | |
1329 | #endif | |
1330 | add $16, %rdx | |
1331 | jmp LABEL(loop_ashr_11_use) | |
1332 | |
1333 | .p2align 4 | |
1334 | LABEL(nibble_ashr_11_use): | |
1335 | sub $0x1000, %r10 /* restart the page countdown */ | |
1336 | movdqa -16(%rdi, %rdx), %xmm0 /* reload the previous block */ | |
618280a1 | 1337 | psrldq $11, D(%xmm0) /* keep its last 5 bytes, zero-filled above */ |
d9a4d2ab UD |
1338 | pcmpistri $0x3a,%xmm0, %xmm0 /* ecx = index of the first null byte in xmm0 */ |
1339 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1340 | cmp %r11, %rcx | |
1341 | jae LABEL(nibble_ashr_exit_use) /* remaining count r11 is used up by these leftover bytes */ | |
1342 | #endif | |
1343 | cmp $4, %ecx | |
1344 | ja LABEL(nibble_ashr_11_restart_use) /* no null byte in the 5 leftover bytes: go on into the next page */ | |
1345 | |
1346 | jmp LABEL(nibble_ashr_exit_use) /* string ends in the leftover bytes: compare them without loading the next rdi block */ | |
1347 | ||
1348 | /* |
1349 | * The following cases will be handled by ashr_12, whose loop reads the rdi data at a 12-byte shift (palignr $12) |
1350 | * rcx (offset of rsi) = n (4~15), rax (offset of rdi) = n - 4, |
1351 | * relative offset = 11 (15 + (n - 4) - n), corresponding case = ashr_12 |
1352 | */ |
1353 | .p2align 4 |
1354 | LABEL(ashr_12): |
618280a1 | 1355 | pslldq $4, D(%xmm2) /* shift xmm2 left by 16 - 12 = 4 bytes */ |
d9a4d2ab | 1356 | TOLOWER (%xmm1, %xmm2) |
618280a1 UD |
1357 | pcmpeqb %xmm1, D(%xmm2) /* xmm2 = byte-wise equality mask against xmm1 */ |
1358 | psubb %xmm0, D(%xmm2) /* NOTE(review): xmm0 is set before this chunk, presumably the null-byte mask of xmm1 - confirm */ | |
d9a4d2ab UD |
1359 | pmovmskb %xmm2, %r9d /* r9d = one bit per mask byte */ |
1360 | shr %cl, %edx /* edx is also set before this chunk */ | |
1361 | shr %cl, %r9d /* drop the low cl bits of both masks */ | |
1362 | sub %r9d, %edx | |
1363 | jnz LABEL(less32bytes) /* masks differ: finish in less32bytes */ | |
1364 | movdqa (%rdi), %xmm3 /* NOTE(review): xmm3 is not read again within this chunk */ | |
1365 | |
1366 | UPDATE_STRNCMP_COUNTER /* strncmp variants only: r11 += rcx - 16, return 0 if that exceeds the old r11 or is 0 */ | |
1367 | |
d9a4d2ab UD |
1368 | mov $16, %rcx /* index for loads */ |
1369 | mov $12, %r9d /* byte position left over from less32bytes case, exit_use rebases rdi by r9 - 16 */ | |
1370 | /* |
1371 | * Set up %r10 so that crossing a page boundary can be detected: it starts as |
1372 | * ((rdi + 12) modulo 4K) - 4K and grows by 16 per block. When %r10 goes positive we have |
1373 | * crossed a page boundary and need to do a nibble (check the leftover bytes first). |
1374 | */ |
1375 | lea 12(%rdi), %r10 | |
1376 | and $0xfff, %r10 /* offset into 4K page */ | |
1377 | sub $0x1000, %r10 /* subtract 4K pagesize */ | |
1378 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ | |
1379 | |
1380 | .p2align 4 | |
1381 | LABEL(loop_ashr_12_use): | |
1382 | add $16, %r10 | |
1383 | jg LABEL(nibble_ashr_12_use) /* r10 went positive: page boundary */ | |
1384 | |
1385 | LABEL(nibble_ashr_12_restart_use): | |
1386 | movdqa (%rdi, %rdx), %xmm0 | |
618280a1 | 1387 | palignr $12, -16(%rdi, %rdx), D(%xmm0) /* xmm0 = the 16 bytes starting 12 bytes into the previous block */ |
d9a4d2ab UD |
1388 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1389 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 | |
1390 | #else | |
1391 | movdqa (%rsi,%rdx), %xmm1 | |
1392 | TOLOWER (%xmm0, %xmm1) | |
1393 | pcmpistri $0x1a, %xmm1, %xmm0 | |
1394 | #endif | |
1395 | jbe LABEL(exit_use) /* CF = bytes differ (index in ecx), ZF = null byte in the rsi block */ | |
1396 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1397 | sub $16, %r11 /* 16 fewer bytes left to compare */ | |
1398 | jbe LABEL(strcmp_exitz) /* count used up: return 0 */ | |
1399 | #endif | |
1400 | |
1401 | add $16, %rdx /* next block: second, unrolled copy of the compare above */ | |
1402 | add $16, %r10 | |
1403 | jg LABEL(nibble_ashr_12_use) | |
1404 | |
1405 | movdqa (%rdi, %rdx), %xmm0 | |
618280a1 | 1406 | palignr $12, -16(%rdi, %rdx), D(%xmm0) |
d9a4d2ab UD |
1407 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1408 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 | |
1409 | #else | |
1410 | movdqa (%rsi,%rdx), %xmm1 | |
1411 | TOLOWER (%xmm0, %xmm1) | |
1412 | pcmpistri $0x1a, %xmm1, %xmm0 | |
1413 | #endif | |
1414 | jbe LABEL(exit_use) | |
1415 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1416 | sub $16, %r11 | |
1417 | jbe LABEL(strcmp_exitz) | |
1418 | #endif | |
1419 | add $16, %rdx | |
1420 | jmp LABEL(loop_ashr_12_use) | |
1421 | |
1422 | .p2align 4 | |
1423 | LABEL(nibble_ashr_12_use): | |
1424 | sub $0x1000, %r10 /* restart the page countdown */ | |
1425 | movdqa -16(%rdi, %rdx), %xmm0 /* reload the previous block */ | |
618280a1 | 1426 | psrldq $12, D(%xmm0) /* keep its last 4 bytes, zero-filled above */ |
d9a4d2ab UD |
1427 | pcmpistri $0x3a,%xmm0, %xmm0 /* ecx = index of the first null byte in xmm0 */ |
1428 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1429 | cmp %r11, %rcx | |
1430 | jae LABEL(nibble_ashr_exit_use) /* remaining count r11 is used up by these leftover bytes */ | |
1431 | #endif | |
1432 | cmp $3, %ecx | |
1433 | ja LABEL(nibble_ashr_12_restart_use) /* no null byte in the 4 leftover bytes: go on into the next page */ | |
1434 | |
1435 | jmp LABEL(nibble_ashr_exit_use) /* string ends in the leftover bytes: compare them without loading the next rdi block */ | |
1436 | ||
1437 | /* |
1438 | * The following cases will be handled by ashr_13, whose loop reads the rdi data at a 13-byte shift (palignr $13) |
1439 | * rcx (offset of rsi) = n (3~15), rax (offset of rdi) = n - 3, |
1440 | * relative offset = 12 (15 + (n - 3) - n), corresponding case = ashr_13 |
1441 | */ |
1442 | .p2align 4 |
1443 | LABEL(ashr_13): |
618280a1 | 1444 | pslldq $3, D(%xmm2) /* shift xmm2 left by 16 - 13 = 3 bytes */ |
d9a4d2ab | 1445 | TOLOWER (%xmm1, %xmm2) |
618280a1 UD |
1446 | pcmpeqb %xmm1, D(%xmm2) /* xmm2 = byte-wise equality mask against xmm1 */ |
1447 | psubb %xmm0, D(%xmm2) /* NOTE(review): xmm0 is set before this chunk, presumably the null-byte mask of xmm1 - confirm */ | |
d9a4d2ab UD |
1448 | pmovmskb %xmm2, %r9d /* r9d = one bit per mask byte */ |
1449 | shr %cl, %edx /* edx is also set before this chunk */ | |
1450 | shr %cl, %r9d /* drop the low cl bits of both masks */ | |
1451 | sub %r9d, %edx | |
1452 | jnz LABEL(less32bytes) /* masks differ: finish in less32bytes */ | |
1453 | movdqa (%rdi), %xmm3 /* NOTE(review): xmm3 is not read again within this chunk */ | |
1454 | |
1455 | UPDATE_STRNCMP_COUNTER /* strncmp variants only: r11 += rcx - 16, return 0 if that exceeds the old r11 or is 0 */ | |
1456 | |
d9a4d2ab UD |
1457 | mov $16, %rcx /* index for loads */ |
1458 | mov $13, %r9d /* byte position left over from less32bytes case, exit_use rebases rdi by r9 - 16 */ | |
1459 | /* |
1460 | * Set up %r10 so that crossing a page boundary can be detected: it starts as |
1461 | * ((rdi + 13) modulo 4K) - 4K and grows by 16 per block. When %r10 goes positive we have |
1462 | * crossed a page boundary and need to do a nibble (check the leftover bytes first). |
1463 | */ |
1464 | lea 13(%rdi), %r10 | |
1465 | and $0xfff, %r10 /* offset into 4K page */ | |
1466 | sub $0x1000, %r10 /* subtract 4K pagesize */ | |
1467 | |
1468 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ | |
1469 | |
1470 | .p2align 4 | |
1471 | LABEL(loop_ashr_13_use): | |
1472 | add $16, %r10 | |
1473 | jg LABEL(nibble_ashr_13_use) /* r10 went positive: page boundary */ | |
1474 | |
1475 | LABEL(nibble_ashr_13_restart_use): | |
1476 | movdqa (%rdi, %rdx), %xmm0 | |
618280a1 | 1477 | palignr $13, -16(%rdi, %rdx), D(%xmm0) /* xmm0 = the 16 bytes starting 13 bytes into the previous block */ |
d9a4d2ab UD |
1478 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1479 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 | |
1480 | #else | |
1481 | movdqa (%rsi,%rdx), %xmm1 | |
1482 | TOLOWER (%xmm0, %xmm1) | |
1483 | pcmpistri $0x1a, %xmm1, %xmm0 | |
1484 | #endif | |
1485 | jbe LABEL(exit_use) /* CF = bytes differ (index in ecx), ZF = null byte in the rsi block */ | |
1486 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1487 | sub $16, %r11 /* 16 fewer bytes left to compare */ | |
1488 | jbe LABEL(strcmp_exitz) /* count used up: return 0 */ | |
1489 | #endif | |
1490 | |
1491 | add $16, %rdx /* next block: second, unrolled copy of the compare above */ | |
1492 | add $16, %r10 | |
1493 | jg LABEL(nibble_ashr_13_use) | |
1494 | |
1495 | movdqa (%rdi, %rdx), %xmm0 | |
618280a1 | 1496 | palignr $13, -16(%rdi, %rdx), D(%xmm0) |
d9a4d2ab UD |
1497 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1498 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 | |
1499 | #else | |
1500 | movdqa (%rsi,%rdx), %xmm1 | |
1501 | TOLOWER (%xmm0, %xmm1) | |
1502 | pcmpistri $0x1a, %xmm1, %xmm0 | |
1503 | #endif | |
1504 | jbe LABEL(exit_use) | |
1505 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1506 | sub $16, %r11 | |
1507 | jbe LABEL(strcmp_exitz) | |
1508 | #endif | |
1509 | add $16, %rdx | |
1510 | jmp LABEL(loop_ashr_13_use) | |
1511 | |
1512 | .p2align 4 | |
1513 | LABEL(nibble_ashr_13_use): | |
1514 | sub $0x1000, %r10 /* restart the page countdown */ | |
1515 | movdqa -16(%rdi, %rdx), %xmm0 /* reload the previous block */ | |
618280a1 | 1516 | psrldq $13, D(%xmm0) /* keep its last 3 bytes, zero-filled above */ |
d9a4d2ab UD |
1517 | pcmpistri $0x3a,%xmm0, %xmm0 /* ecx = index of the first null byte in xmm0 */ |
1518 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1519 | cmp %r11, %rcx | |
1520 | jae LABEL(nibble_ashr_exit_use) /* remaining count r11 is used up by these leftover bytes */ | |
1521 | #endif | |
1522 | cmp $2, %ecx | |
1523 | ja LABEL(nibble_ashr_13_restart_use) /* no null byte in the 3 leftover bytes: go on into the next page */ | |
1524 | |
1525 | jmp LABEL(nibble_ashr_exit_use) /* string ends in the leftover bytes: compare them without loading the next rdi block */ | |
1526 | ||
1527 | /* |
1528 | * The following cases will be handled by ashr_14, whose loop reads the rdi data at a 14-byte shift (palignr $14) |
1529 | * rcx (offset of rsi) = n (2~15), rax (offset of rdi) = n - 2, |
1530 | * relative offset = 13 (15 + (n - 2) - n), corresponding case = ashr_14 |
1531 | */ |
1532 | .p2align 4 |
1533 | LABEL(ashr_14): |
618280a1 | 1534 | pslldq $2, D(%xmm2) /* shift xmm2 left by 16 - 14 = 2 bytes */ |
d9a4d2ab | 1535 | TOLOWER (%xmm1, %xmm2) |
618280a1 UD |
1536 | pcmpeqb %xmm1, D(%xmm2) /* xmm2 = byte-wise equality mask against xmm1 */ |
1537 | psubb %xmm0, D(%xmm2) /* NOTE(review): xmm0 is set before this chunk, presumably the null-byte mask of xmm1 - confirm */ | |
d9a4d2ab UD |
1538 | pmovmskb %xmm2, %r9d /* r9d = one bit per mask byte */ |
1539 | shr %cl, %edx /* edx is also set before this chunk */ | |
1540 | shr %cl, %r9d /* drop the low cl bits of both masks */ | |
1541 | sub %r9d, %edx | |
1542 | jnz LABEL(less32bytes) /* masks differ: finish in less32bytes */ | |
1543 | movdqa (%rdi), %xmm3 /* NOTE(review): xmm3 is not read again within this chunk */ | |
1544 | |
1545 | UPDATE_STRNCMP_COUNTER /* strncmp variants only: r11 += rcx - 16, return 0 if that exceeds the old r11 or is 0 */ | |
1546 | |
d9a4d2ab UD |
1547 | mov $16, %rcx /* index for loads */ |
1548 | mov $14, %r9d /* byte position left over from less32bytes case, exit_use rebases rdi by r9 - 16 */ | |
1549 | /* |
1550 | * Set up %r10 so that crossing a page boundary can be detected: it starts as |
1551 | * ((rdi + 14) modulo 4K) - 4K and grows by 16 per block. When %r10 goes positive we have |
1552 | * crossed a page boundary and need to do a nibble (check the leftover bytes first). |
1553 | */ |
1554 | lea 14(%rdi), %r10 | |
1555 | and $0xfff, %r10 /* offset into 4K page */ | |
1556 | sub $0x1000, %r10 /* subtract 4K pagesize */ | |
1557 | |
1558 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ | |
1559 | |
1560 | .p2align 4 | |
1561 | LABEL(loop_ashr_14_use): | |
1562 | add $16, %r10 | |
1563 | jg LABEL(nibble_ashr_14_use) /* r10 went positive: page boundary */ | |
1564 | |
1565 | LABEL(nibble_ashr_14_restart_use): | |
1566 | movdqa (%rdi, %rdx), %xmm0 | |
618280a1 | 1567 | palignr $14, -16(%rdi, %rdx), D(%xmm0) /* xmm0 = the 16 bytes starting 14 bytes into the previous block */ |
d9a4d2ab UD |
1568 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1569 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 | |
1570 | #else | |
1571 | movdqa (%rsi,%rdx), %xmm1 | |
1572 | TOLOWER (%xmm0, %xmm1) | |
1573 | pcmpistri $0x1a, %xmm1, %xmm0 | |
1574 | #endif | |
1575 | jbe LABEL(exit_use) /* CF = bytes differ (index in ecx), ZF = null byte in the rsi block */ | |
1576 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1577 | sub $16, %r11 /* 16 fewer bytes left to compare */ | |
1578 | jbe LABEL(strcmp_exitz) /* count used up: return 0 */ | |
1579 | #endif | |
1580 | |
1581 | add $16, %rdx /* next block: second, unrolled copy of the compare above */ | |
1582 | add $16, %r10 | |
1583 | jg LABEL(nibble_ashr_14_use) | |
1584 | |
1585 | movdqa (%rdi, %rdx), %xmm0 | |
618280a1 | 1586 | palignr $14, -16(%rdi, %rdx), D(%xmm0) |
d9a4d2ab UD |
1587 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1588 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 | |
1589 | #else | |
1590 | movdqa (%rsi,%rdx), %xmm1 | |
1591 | TOLOWER (%xmm0, %xmm1) | |
1592 | pcmpistri $0x1a, %xmm1, %xmm0 | |
1593 | #endif | |
1594 | jbe LABEL(exit_use) | |
1595 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1596 | sub $16, %r11 | |
1597 | jbe LABEL(strcmp_exitz) | |
1598 | #endif | |
1599 | add $16, %rdx | |
1600 | jmp LABEL(loop_ashr_14_use) | |
1601 | |
1602 | .p2align 4 | |
1603 | LABEL(nibble_ashr_14_use): | |
1604 | sub $0x1000, %r10 /* restart the page countdown */ | |
1605 | movdqa -16(%rdi, %rdx), %xmm0 /* reload the previous block */ | |
618280a1 | 1606 | psrldq $14, D(%xmm0) /* keep its last 2 bytes, zero-filled above */ |
d9a4d2ab UD |
1607 | pcmpistri $0x3a,%xmm0, %xmm0 /* ecx = index of the first null byte in xmm0 */ |
1608 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1609 | cmp %r11, %rcx | |
1610 | jae LABEL(nibble_ashr_exit_use) /* remaining count r11 is used up by these leftover bytes */ | |
1611 | #endif | |
1612 | cmp $1, %ecx | |
1613 | ja LABEL(nibble_ashr_14_restart_use) /* no null byte in the 2 leftover bytes: go on into the next page */ | |
1614 | |
1615 | jmp LABEL(nibble_ashr_exit_use) /* string ends in the leftover bytes: compare them without loading the next rdi block */ | |
1616 | ||
1617 | /* |
1618 | * The following cases will be handled by ashr_15, whose loop reads the rdi data at a 15-byte shift (palignr $15) |
1619 | * rcx (offset of rsi) = n (1~15), rax (offset of rdi) = n - 1, |
1620 | * relative offset = 14 (15 + (n - 1) - n), corresponding case = ashr_15 |
1621 | */ |
1622 | .p2align 4 |
1623 | LABEL(ashr_15): |
618280a1 | 1624 | pslldq $1, D(%xmm2) /* shift xmm2 left by 16 - 15 = 1 byte */ |
d9a4d2ab | 1625 | TOLOWER (%xmm1, %xmm2) |
618280a1 UD |
1626 | pcmpeqb %xmm1, D(%xmm2) /* xmm2 = byte-wise equality mask against xmm1 */ |
1627 | psubb %xmm0, D(%xmm2) /* NOTE(review): xmm0 is set before this chunk, presumably the null-byte mask of xmm1 - confirm */ | |
d9a4d2ab UD |
1628 | pmovmskb %xmm2, %r9d /* r9d = one bit per mask byte */ |
1629 | shr %cl, %edx /* edx is also set before this chunk */ | |
1630 | shr %cl, %r9d /* drop the low cl bits of both masks */ | |
1631 | sub %r9d, %edx | |
1632 | jnz LABEL(less32bytes) /* masks differ: finish in less32bytes */ | |
1633 | |
1634 | movdqa (%rdi), %xmm3 /* NOTE(review): xmm3 is not read again within this chunk */ | |
1635 | |
1636 | UPDATE_STRNCMP_COUNTER /* strncmp variants only: r11 += rcx - 16, return 0 if that exceeds the old r11 or is 0 */ | |
1637 | |
d9a4d2ab UD |
1638 | mov $16, %rcx /* index for loads */ |
1639 | mov $15, %r9d /* byte position left over from less32bytes case, exit_use rebases rdi by r9 - 16 */ | |
1640 | /* |
1641 | * Set up %r10 so that crossing a page boundary can be detected: it starts as |
1642 | * ((rdi + 15) modulo 4K) - 4K and grows by 16 per block. When %r10 goes positive we have |
1643 | * crossed a page boundary and need to do a nibble (check the leftover bytes first). |
1644 | */ |
1645 | lea 15(%rdi), %r10 | |
1646 | and $0xfff, %r10 /* offset into 4K page */ | |
1647 | |
1648 | sub $0x1000, %r10 /* subtract 4K pagesize */ | |
1649 | |
1650 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ | |
1651 | |
1652 | .p2align 4 | |
1653 | LABEL(loop_ashr_15_use): | |
1654 | add $16, %r10 | |
1655 | jg LABEL(nibble_ashr_15_use) /* r10 went positive: page boundary */ | |
1656 | |
1657 | LABEL(nibble_ashr_15_restart_use): | |
1658 | movdqa (%rdi, %rdx), %xmm0 | |
618280a1 | 1659 | palignr $15, -16(%rdi, %rdx), D(%xmm0) /* xmm0 = the 16 bytes starting 15 bytes into the previous block */ |
d9a4d2ab UD |
1660 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1661 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 | |
1662 | #else | |
1663 | movdqa (%rsi,%rdx), %xmm1 | |
1664 | TOLOWER (%xmm0, %xmm1) | |
1665 | pcmpistri $0x1a, %xmm1, %xmm0 | |
1666 | #endif | |
1667 | jbe LABEL(exit_use) /* CF = bytes differ (index in ecx), ZF = null byte in the rsi block */ | |
1668 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1669 | sub $16, %r11 /* 16 fewer bytes left to compare */ | |
1670 | jbe LABEL(strcmp_exitz) /* count used up: return 0 */ | |
1671 | #endif | |
1672 | |
1673 | add $16, %rdx /* next block: second, unrolled copy of the compare above */ | |
1674 | add $16, %r10 | |
1675 | jg LABEL(nibble_ashr_15_use) | |
1676 | |
1677 | movdqa (%rdi, %rdx), %xmm0 | |
618280a1 | 1678 | palignr $15, -16(%rdi, %rdx), D(%xmm0) |
d9a4d2ab UD |
1679 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1680 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 | |
1681 | #else | |
1682 | movdqa (%rsi,%rdx), %xmm1 | |
1683 | TOLOWER (%xmm0, %xmm1) | |
1684 | pcmpistri $0x1a, %xmm1, %xmm0 | |
1685 | #endif | |
1686 | jbe LABEL(exit_use) | |
1687 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1688 | sub $16, %r11 | |
1689 | jbe LABEL(strcmp_exitz) | |
1690 | #endif | |
1691 | add $16, %rdx | |
1692 | jmp LABEL(loop_ashr_15_use) | |
1693 | |
1694 | .p2align 4 | |
1695 | LABEL(nibble_ashr_15_use): | |
1696 | sub $0x1000, %r10 /* restart the page countdown */ | |
1697 | movdqa -16(%rdi, %rdx), %xmm0 /* reload the previous block */ | |
618280a1 | 1698 | psrldq $15, D(%xmm0) /* keep its last byte, zero-filled above */ |
d9a4d2ab UD |
1699 | pcmpistri $0x3a,%xmm0, %xmm0 /* ecx = index of the first null byte in xmm0 */ |
1700 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1701 | cmp %r11, %rcx | |
1702 | jae LABEL(nibble_ashr_exit_use) /* remaining count r11 is used up by these leftover bytes */ | |
1703 | #endif | |
1704 | cmp $0, %ecx | |
1705 | ja LABEL(nibble_ashr_15_restart_use) /* leftover byte is not null: go on into the next page, otherwise fall through to nibble_ashr_exit_use */ | |
1706 | ||
1707 | LABEL(nibble_ashr_exit_use): /* reached from the nibble code with the zero-padded leftover bytes in xmm0 */ | |
1708 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L | |
1709 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
1710 | #else | |
1711 | movdqa (%rsi,%rdx), %xmm1 | |
1712 | TOLOWER (%xmm0, %xmm1) | |
1713 | pcmpistri $0x1a, %xmm1, %xmm0 | |
1714 | #endif | |
1715 | .p2align 4 | |
1716 | LABEL(exit_use): /* entered with the flags and ecx of the last pcmpistri $0x1a */ | |
1717 | jnc LABEL(strcmp_exitz) /* CF clear: no differing byte was found, return 0 */ | |
1718 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1719 | sub %rcx, %r11 | |
1720 | jbe LABEL(strcmp_exitz) /* the differing byte lies at or beyond the remaining count */ | |
1721 | #endif | |
1722 | add %rcx, %rdx /* rdx = block offset plus index of the differing byte */ | |
1723 | lea -16(%rdi, %r9), %rdi /* rdi += r9 - 16, r9 being the shift count set by the ashr_N case */ | |
1724 | movzbl (%rdi, %rdx), %eax /* eax = differing byte on the rdi side */ | |
1725 | movzbl (%rsi, %rdx), %edx /* edx = differing byte on the rsi side */ | |
1726 | test %r8d, %r8d | |
1727 | jz LABEL(ret_use) | |
1728 | xchg %eax, %edx /* r8d nonzero: swap the two bytes before subtracting */ | |
1729 | LABEL(ret_use): | |
1730 | #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L | |
1731 | leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx /* case-insensitive variants: table of 4-byte entries */ | |
1732 | movl (%rcx,%rdx,4), %edx /* replace both bytes by their table entries */ | |
1733 | movl (%rcx,%rax,4), %eax | |
1734 | #endif | |
1735 | |
1736 | sub %edx, %eax /* return value = eax - edx */ | |
1737 | ret | |
1738 | ||
1739 | LABEL(less32bytes): /* reached from the ashr_N cases when the shifted masks differ, with the difference in edx */ | |
1740 | lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */ | |
1741 | lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */ | |
1742 | test %r8d, %r8d | |
1743 | jz LABEL(ret) | |
1744 | xchg %rsi, %rdi /* recover original order according to flag(%r8d) */ | |
1745 | |
1746 | .p2align 4 | |
1747 | LABEL(ret): | |
1748 | LABEL(less16bytes): | |
1749 | bsf %rdx, %rdx /* find and store bit index in %rdx: index of the lowest set bit, used below as byte offset */ | |
1750 | |
1751 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | |
1752 | sub %rdx, %r11 | |
1753 | jbe LABEL(strcmp_exitz) /* that offset is at or beyond the remaining count: return 0 */ | |
1754 | #endif | |
1755 | movzbl (%rsi, %rdx), %ecx /* ecx = byte of the second operand at that offset */ | |
1756 | movzbl (%rdi, %rdx), %eax /* eax = byte of the first operand at that offset */ | |
1757 | |
1758 | #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L | |
1759 | leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx /* case-insensitive variants: table of 4-byte entries */ | |
1760 | movl (%rdx,%rcx,4), %ecx /* replace both bytes by their table entries */ | |
1761 | movl (%rdx,%rax,4), %eax | |
1762 | #endif | |
1763 | |
1764 | sub %ecx, %eax /* return value = eax - ecx */ | |
1765 | ret | |
1766 | ||
1767 | LABEL(strcmp_exitz): /* Shared exit for every path that concludes the strings compare equal. */ | |
1768 | xor %eax, %eax /* return 0 */ | |
1769 | ret | |
1770 | ||
1771 | .p2align 4 | |
1772 | // XXX Same as the tail of LABEL(less16bytes) with the byte index fixed at 0 | |
1773 | LABEL(Byte0): /* Return the difference of the bytes that %rdi and %rsi point at directly. */ | |
1774 | movzx (%rsi), %ecx | |
1775 | movzx (%rdi), %eax | |
1776 ||
1777 | #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L | |
1778 | leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx /* table base skipped forward by 128 four-byte entries */ | |
1779 | movl (%rdx,%rcx,4), %ecx /* replace each byte with its 32-bit table entry */ | |
1780 | movl (%rdx,%rax,4), %eax | |
1781 | #endif | |
1782 ||
1783 | sub %ecx, %eax /* return the (%rdi) byte minus the (%rsi) byte */ | |
1784 | ret | |
1785 | cfi_endproc | |
1786 | .size STRCMP_SSE42, .-STRCMP_SSE42 /* symbol size = distance from the entry label to this point */ | |
1787 | ||
1788 | #undef UCLOW_reg /* Drop the helper macros whose definitions appear earlier in the file, so nothing below sees them. */ | |
1789 | #undef UCHIGH_reg | |
1790 | #undef LCQWORD_reg | |
1791 | #undef TOLOWER | |
1792 | ||
1793 | /* Put all SSE 4.2 functions together. */ | |
1794 | .section .rodata.SECTION,"a",@progbits | |
1795 | .p2align 3 | |
1796 | LABEL(unaligned_table): /* 16 four-byte entries, each the offset of an ashr_N label from the table start. NOTE(review): presumably indexed by relative misalignment to dispatch to the matching shift case -- confirm at the use site. */ | |
1797 | .int LABEL(ashr_1) - LABEL(unaligned_table) /* entries 0-14 refer to ashr_1 through ashr_15 in order */ | |
1798 | .int LABEL(ashr_2) - LABEL(unaligned_table) | |
1799 | .int LABEL(ashr_3) - LABEL(unaligned_table) | |
1800 | .int LABEL(ashr_4) - LABEL(unaligned_table) | |
1801 | .int LABEL(ashr_5) - LABEL(unaligned_table) | |
1802 | .int LABEL(ashr_6) - LABEL(unaligned_table) | |
1803 | .int LABEL(ashr_7) - LABEL(unaligned_table) | |
1804 | .int LABEL(ashr_8) - LABEL(unaligned_table) | |
1805 | .int LABEL(ashr_9) - LABEL(unaligned_table) | |
1806 | .int LABEL(ashr_10) - LABEL(unaligned_table) | |
1807 | .int LABEL(ashr_11) - LABEL(unaligned_table) | |
1808 | .int LABEL(ashr_12) - LABEL(unaligned_table) | |
1809 | .int LABEL(ashr_13) - LABEL(unaligned_table) | |
1810 | .int LABEL(ashr_14) - LABEL(unaligned_table) | |
1811 | .int LABEL(ashr_15) - LABEL(unaligned_table) | |
1812 | .int LABEL(ashr_0) - LABEL(unaligned_table) /* the final entry wraps around to ashr_0 */ | |
1813 | ||
1814 | #undef LABEL /* Remove every macro this file set up. NOTE(review): presumably so it can be included again with different settings (e.g. USE_AVX) -- confirm. */ | |
1815 | #undef GLABEL | |
1816 | #undef SECTION | |
618280a1 UD |
1817 | #undef movdqa /* the instruction mnemonics below were macro names too, hence the undefs */ |
1818 | #undef movdqu | |
1819 | #undef pmovmskb | |
1820 | #undef pcmpistri | |
1821 | #undef psubb | |
1822 | #undef pcmpeqb | |
1823 | #undef psrldq | |
1824 | #undef pslldq | |
1825 | #undef palignr | |
1826 | #undef pxor | |
1827 | #undef D |