sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S

/* memcmp/wmemcmp optimized with AVX2.
   Copyright (C) 2017-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

/* memcmp/wmemcmp is implemented as:
   1. For size from 2 to 7 bytes, load as big endian with movbe and bswap
      to avoid branches.
   2. Use overlapping compare to avoid branch.
   3. Use vector compare when size >= 4 bytes for memcmp or size >= 8
      bytes for wmemcmp.
   4. If size is 8 * VEC_SIZE or less, unroll the loop.
   5. Compare 4 * VEC_SIZE at a time with the aligned first memory
      area.
   6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
   7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
   8. Use 8 vector compares when size is 8 * VEC_SIZE or less.  */
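
/* With VEC_SIZE == 32 (defined below) this means: sizes below 32 bytes
   take the scalar and XMM paths under L(less_vec), sizes up to
   8 * VEC_SIZE == 256 bytes are handled with unrolled YMM compares, and
   larger sizes fall into the aligned 4 * VEC_SIZE loop.  */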

# include <sysdep.h>

# ifndef MEMCMP
#  define MEMCMP	__memcmp_avx2_movbe
# endif

# ifdef USE_AS_WMEMCMP
#  define VPCMPEQ	vpcmpeqd
# else
#  define VPCMPEQ	vpcmpeqb
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

# define VEC_SIZE 32
# define VEC_MASK ((1 << VEC_SIZE) - 1)
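/* vpmovmskb on a YMM register yields a 32-bit mask, one bit per byte, so
   VEC_MASK evaluates to 0xffffffff (GNU as computes the shift in 64-bit
   arithmetic, so it does not overflow).  Subtracting VEC_MASK from such a
   mask therefore gives zero exactly when every byte position compared
   equal.  */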

/* Warning!
   wmemcmp has to use SIGNED comparison for elements.
   memcmp has to use UNSIGNED comparison for elements.  */

        .section .text.avx,"ax",@progbits
ENTRY (MEMCMP)
# ifdef USE_AS_WMEMCMP
        shl $2, %rdx
# endif
        cmpq $VEC_SIZE, %rdx
        jb L(less_vec)

        /* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
        vmovdqu (%rsi), %ymm2
        VPCMPEQ (%rdi), %ymm2, %ymm2
        vpmovmskb %ymm2, %eax
        subl $VEC_MASK, %eax
        jnz L(first_vec)

        cmpq $(VEC_SIZE * 2), %rdx
        jbe L(last_vec)

        VPCMPEQ %ymm0, %ymm0, %ymm0
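        /* ymm0 is now all ones (a register always compares equal to
           itself); the vptest checks below use it to test whether the
           accumulated compare results are all ones.  */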
        /* More than 2 * VEC.  */
        cmpq $(VEC_SIZE * 8), %rdx
        ja L(more_8x_vec)
        cmpq $(VEC_SIZE * 4), %rdx
        jb L(last_4x_vec)

        /* From 4 * VEC to 8 * VEC, inclusively.  */
        vmovdqu (%rsi), %ymm1
        VPCMPEQ (%rdi), %ymm1, %ymm1

        vmovdqu VEC_SIZE(%rsi), %ymm2
        VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2

        vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3
        VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3

        vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4
        VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4

        vpand %ymm1, %ymm2, %ymm5
        vpand %ymm3, %ymm4, %ymm6
        vpand %ymm5, %ymm6, %ymm5

        vptest %ymm0, %ymm5
        jnc L(4x_vec_end)
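        /* vptest sets CF when (~%ymm5 & %ymm0) is zero, i.e. when ymm5 is
           all ones, meaning all four compares matched.  jnc therefore
           branches to the mismatch handler only when some byte (or dword
           for wmemcmp) differed.  */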

        leaq -(4 * VEC_SIZE)(%rdi, %rdx), %rdi
        leaq -(4 * VEC_SIZE)(%rsi, %rdx), %rsi
        vmovdqu (%rsi), %ymm1
        VPCMPEQ (%rdi), %ymm1, %ymm1

        vmovdqu VEC_SIZE(%rsi), %ymm2
        VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
        vpand %ymm2, %ymm1, %ymm5

        vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3
        VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
        vpand %ymm3, %ymm5, %ymm5

        vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4
        VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
        vpand %ymm4, %ymm5, %ymm5

        vptest %ymm0, %ymm5
        jnc L(4x_vec_end)
        xorl %eax, %eax
        VZEROUPPER
        ret

        .p2align 4
L(last_2x_vec):
        /* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
        vmovdqu (%rsi), %ymm2
        VPCMPEQ (%rdi), %ymm2, %ymm2
        vpmovmskb %ymm2, %eax
        subl $VEC_MASK, %eax
        jnz L(first_vec)

L(last_vec):
        /* Use overlapping loads to avoid branches.  */
        leaq -VEC_SIZE(%rdi, %rdx), %rdi
        leaq -VEC_SIZE(%rsi, %rdx), %rsi
        vmovdqu (%rsi), %ymm2
        VPCMPEQ (%rdi), %ymm2, %ymm2
        vpmovmskb %ymm2, %eax
        subl $VEC_MASK, %eax
        jnz L(first_vec)
        VZEROUPPER
        ret

        .p2align 4
L(first_vec):
        /* A byte or int32 is different within 16 or 32 bytes.  */
        tzcntl %eax, %ecx
# ifdef USE_AS_WMEMCMP
        xorl %eax, %eax
        movl (%rdi, %rcx), %edx
        cmpl (%rsi, %rcx), %edx
L(wmemcmp_return):
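        /* Turn the signed comparison above into -1 or 1: setl/negl gives
           -1 when the first wchar_t is smaller and 0 otherwise, and
           orl $1 maps 0 to 1 while leaving -1 unchanged.  */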
        setl %al
        negl %eax
        orl $1, %eax
# else
        movzbl (%rdi, %rcx), %eax
        movzbl (%rsi, %rcx), %edx
        sub %edx, %eax
# endif
        VZEROUPPER
        ret

# ifdef USE_AS_WMEMCMP
        .p2align 4
L(4):
        xorl %eax, %eax
        movl (%rdi), %edx
        cmpl (%rsi), %edx
        jne L(wmemcmp_return)
        ret
# else
        .p2align 4
L(between_4_7):
        /* Load as big endian with overlapping movbe to avoid branches.  */
        movbe (%rdi), %eax
        movbe (%rsi), %ecx
        shlq $32, %rax
        shlq $32, %rcx
        movbe -4(%rdi, %rdx), %edi
        movbe -4(%rsi, %rdx), %esi
        orq %rdi, %rax
        orq %rsi, %rcx
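        /* Each side now holds its first and last four bytes packed
           big-endian, earliest bytes most significant, so the unsigned
           64-bit comparison below orders them like a byte-wise memcmp;
           sub/sbb/or then produce -1, 0 or 1.  */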
        subq %rcx, %rax
        je L(exit)
        sbbl %eax, %eax
        orl $1, %eax
        ret

        .p2align 4
L(exit):
        ret

        .p2align 4
L(between_2_3):
        /* Load as big endian to avoid branches.  */
        movzwl (%rdi), %eax
        movzwl (%rsi), %ecx
        shll $8, %eax
        shll $8, %ecx
        bswap %eax
        bswap %ecx
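        /* After the shift and bswap the first byte sits in bits 23:16 and
           the second in bits 15:8; the trailing byte loaded below fills
           bits 7:0, so the values compare in memcmp order.  */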
        movb -1(%rdi, %rdx), %al
        movb -1(%rsi, %rdx), %cl
        /* Subtraction is okay because the upper 8 bits are zero.  */
        subl %ecx, %eax
        ret

        .p2align 4
L(1):
        movzbl (%rdi), %eax
        movzbl (%rsi), %ecx
        subl %ecx, %eax
        ret
# endif

        .p2align 4
L(zero):
        xorl %eax, %eax
        ret

        .p2align 4
L(less_vec):
# ifdef USE_AS_WMEMCMP
        /* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes.  */
        cmpb $4, %dl
        je L(4)
        jb L(zero)
# else
        cmpb $1, %dl
        je L(1)
        jb L(zero)
        cmpb $4, %dl
        jb L(between_2_3)
        cmpb $8, %dl
        jb L(between_4_7)
# endif
        cmpb $16, %dl
        jae L(between_16_31)
        /* It is between 8 and 15 bytes.  */
        vmovq (%rdi), %xmm1
        vmovq (%rsi), %xmm2
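        /* vmovq zeroes the upper eight bytes of each xmm register, so
           those lanes always compare equal and only the eight loaded
           bytes can clear bits in the 0xffff mask checked below.  */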
        VPCMPEQ %xmm1, %xmm2, %xmm2
        vpmovmskb %xmm2, %eax
        subl $0xffff, %eax
        jnz L(first_vec)
        /* Use overlapping loads to avoid branches.  */
        leaq -8(%rdi, %rdx), %rdi
        leaq -8(%rsi, %rdx), %rsi
        vmovq (%rdi), %xmm1
        vmovq (%rsi), %xmm2
        VPCMPEQ %xmm1, %xmm2, %xmm2
        vpmovmskb %xmm2, %eax
        subl $0xffff, %eax
        jnz L(first_vec)
        ret

        .p2align 4
L(between_16_31):
        /* From 16 to 31 bytes.  No branch when size == 16.  */
        vmovdqu (%rsi), %xmm2
        VPCMPEQ (%rdi), %xmm2, %xmm2
        vpmovmskb %xmm2, %eax
        subl $0xffff, %eax
        jnz L(first_vec)

        /* Use overlapping loads to avoid branches.  */
        leaq -16(%rdi, %rdx), %rdi
        leaq -16(%rsi, %rdx), %rsi
        vmovdqu (%rsi), %xmm2
        VPCMPEQ (%rdi), %xmm2, %xmm2
        vpmovmskb %xmm2, %eax
        subl $0xffff, %eax
        jnz L(first_vec)
        ret

        .p2align 4
L(more_8x_vec):
        /* More than 8 * VEC.  Check the first VEC.  */
        vmovdqu (%rsi), %ymm2
        VPCMPEQ (%rdi), %ymm2, %ymm2
        vpmovmskb %ymm2, %eax
        subl $VEC_MASK, %eax
        jnz L(first_vec)

        /* Align the first memory area for aligned loads in the loop.
           Compute how much the first memory area is misaligned.  */
        movq %rdi, %rcx
        andl $(VEC_SIZE - 1), %ecx
        /* Get the negative of offset for alignment.  */
        subq $VEC_SIZE, %rcx
        /* Adjust the second memory area.  */
        subq %rcx, %rsi
        /* Adjust the first memory area which should be aligned now.  */
        subq %rcx, %rdi
        /* Adjust length.  */
        addq %rcx, %rdx
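        /* %rcx is negative (misalignment - VEC_SIZE), so both pointers
           advance by at most VEC_SIZE bytes to the next 32-byte boundary
           of %rdi, and the length shrinks by the same amount.  The
           skipped bytes were already covered by the first-VEC check
           above.  */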

L(loop_4x_vec):
        /* Compare 4 * VEC at a time forward.  */
        vmovdqu (%rsi), %ymm1
        VPCMPEQ (%rdi), %ymm1, %ymm1

        vmovdqu VEC_SIZE(%rsi), %ymm2
        VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
        vpand %ymm2, %ymm1, %ymm5

        vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3
        VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
        vpand %ymm3, %ymm5, %ymm5

        vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4
        VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
        vpand %ymm4, %ymm5, %ymm5

        vptest %ymm0, %ymm5
        jnc L(4x_vec_end)

        addq $(VEC_SIZE * 4), %rdi
        addq $(VEC_SIZE * 4), %rsi

        subq $(VEC_SIZE * 4), %rdx
        cmpq $(VEC_SIZE * 4), %rdx
        jae L(loop_4x_vec)

        /* Less than 4 * VEC.  */
        cmpq $VEC_SIZE, %rdx
        jbe L(last_vec)
        cmpq $(VEC_SIZE * 2), %rdx
        jbe L(last_2x_vec)

L(last_4x_vec):
        /* From 2 * VEC to 4 * VEC.  */
        vmovdqu (%rsi), %ymm2
        VPCMPEQ (%rdi), %ymm2, %ymm2
        vpmovmskb %ymm2, %eax
        subl $VEC_MASK, %eax
        jnz L(first_vec)

        addq $VEC_SIZE, %rdi
        addq $VEC_SIZE, %rsi
        vmovdqu (%rsi), %ymm2
        VPCMPEQ (%rdi), %ymm2, %ymm2
        vpmovmskb %ymm2, %eax
        subl $VEC_MASK, %eax
        jnz L(first_vec)

        /* Use overlapping loads to avoid branches.  */
        leaq -(3 * VEC_SIZE)(%rdi, %rdx), %rdi
        leaq -(3 * VEC_SIZE)(%rsi, %rdx), %rsi
        vmovdqu (%rsi), %ymm2
        VPCMPEQ (%rdi), %ymm2, %ymm2
        vpmovmskb %ymm2, %eax
        subl $VEC_MASK, %eax
        jnz L(first_vec)

        addq $VEC_SIZE, %rdi
        addq $VEC_SIZE, %rsi
        vmovdqu (%rsi), %ymm2
        VPCMPEQ (%rdi), %ymm2, %ymm2
        vpmovmskb %ymm2, %eax
        subl $VEC_MASK, %eax
        jnz L(first_vec)
        VZEROUPPER
        ret

        .p2align 4
L(4x_vec_end):
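        /* Check the four compare results in order to locate the first
           differing VEC.  The first mask can reuse L(first_vec) directly
           because %rdi and %rsi still point at its base; the later ones
           need the VEC_SIZE, 2 * VEC_SIZE or 3 * VEC_SIZE offset added
           when reading back the differing element.  */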
        vpmovmskb %ymm1, %eax
        subl $VEC_MASK, %eax
        jnz L(first_vec)
        vpmovmskb %ymm2, %eax
        subl $VEC_MASK, %eax
        jnz L(first_vec_x1)
        vpmovmskb %ymm3, %eax
        subl $VEC_MASK, %eax
        jnz L(first_vec_x2)
        vpmovmskb %ymm4, %eax
        subl $VEC_MASK, %eax
        tzcntl %eax, %ecx
# ifdef USE_AS_WMEMCMP
        xorl %eax, %eax
        movl (VEC_SIZE * 3)(%rdi, %rcx), %edx
        cmpl (VEC_SIZE * 3)(%rsi, %rcx), %edx
        jmp L(wmemcmp_return)
# else
        movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
        movzbl (VEC_SIZE * 3)(%rsi, %rcx), %edx
        sub %edx, %eax
# endif
        VZEROUPPER
        ret

        .p2align 4
L(first_vec_x1):
        tzcntl %eax, %ecx
# ifdef USE_AS_WMEMCMP
        xorl %eax, %eax
        movl VEC_SIZE(%rdi, %rcx), %edx
        cmpl VEC_SIZE(%rsi, %rcx), %edx
        jmp L(wmemcmp_return)
# else
        movzbl VEC_SIZE(%rdi, %rcx), %eax
        movzbl VEC_SIZE(%rsi, %rcx), %edx
        sub %edx, %eax
# endif
        VZEROUPPER
        ret

        .p2align 4
L(first_vec_x2):
        tzcntl %eax, %ecx
# ifdef USE_AS_WMEMCMP
        xorl %eax, %eax
        movl (VEC_SIZE * 2)(%rdi, %rcx), %edx
        cmpl (VEC_SIZE * 2)(%rsi, %rcx), %edx
        jmp L(wmemcmp_return)
# else
        movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
        movzbl (VEC_SIZE * 2)(%rsi, %rcx), %edx
        sub %edx, %eax
# endif
        VZEROUPPER
        ret
END (MEMCMP)
#endif