1 /* memcmp/wmemcmp optimized with AVX2.
2 Copyright (C) 2017-2018 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
21 /* memcmp/wmemcmp is implemented as:
22 1. For size from 2 to 7 bytes, load as big endian with movbe and bswap
24 2. Use overlapping compare to avoid branch.
25 3. Use vector compare when size >= 4 bytes for memcmp or size >= 8
27 4. If size is 8 * VEC_SIZE or less, unroll the loop.
28 5. Compare 4 * VEC_SIZE at a time with the aligned first memory
30 6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
31 7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
32 8. Use 8 vector compares when size is 8 * VEC_SIZE or less. */
37 # define MEMCMP __memcmp_avx2_movbe
40 # ifdef USE_AS_WMEMCMP
41 # define VPCMPEQ vpcmpeqd
43 # define VPCMPEQ vpcmpeqb
47 # define VZEROUPPER vzeroupper
51 # define VEC_MASK ((1 << VEC_SIZE) - 1)
54 wmemcmp has to use SIGNED comparison for elements.
55 memcmp has to use UNSIGNED comparison for elements.
58 .section .text.avx,"ax",@progbits
60 # ifdef USE_AS_WMEMCMP
66 /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */
68 VPCMPEQ (%rdi), %ymm2, %ymm2
73 cmpq $(VEC_SIZE * 2), %rdx
76 VPCMPEQ %ymm0, %ymm0, %ymm0
77 /* More than 2 * VEC. */
78 cmpq $(VEC_SIZE * 8), %rdx
80 cmpq $(VEC_SIZE * 4), %rdx
83 /* From 4 * VEC to 8 * VEC, inclusively. */
85 VPCMPEQ (%rdi), %ymm1, %ymm1
87 vmovdqu VEC_SIZE(%rsi), %ymm2
88 VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
90 vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3
91 VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
93 vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4
94 VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
96 vpand %ymm1, %ymm2, %ymm5
97 vpand %ymm3, %ymm4, %ymm6
98 vpand %ymm5, %ymm6, %ymm5
103 leaq -(4 * VEC_SIZE)(%rdi, %rdx), %rdi
104 leaq -(4 * VEC_SIZE)(%rsi, %rdx), %rsi
105 vmovdqu (%rsi), %ymm1
106 VPCMPEQ (%rdi), %ymm1, %ymm1
108 vmovdqu VEC_SIZE(%rsi), %ymm2
109 VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
110 vpand %ymm2, %ymm1, %ymm5
112 vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3
113 VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
114 vpand %ymm3, %ymm5, %ymm5
116 vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4
117 VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
118 vpand %ymm4, %ymm5, %ymm5
128 /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */
129 vmovdqu (%rsi), %ymm2
130 VPCMPEQ (%rdi), %ymm2, %ymm2
131 vpmovmskb %ymm2, %eax
136 /* Use overlapping loads to avoid branches. */
137 leaq -VEC_SIZE(%rdi, %rdx), %rdi
138 leaq -VEC_SIZE(%rsi, %rdx), %rsi
139 vmovdqu (%rsi), %ymm2
140 VPCMPEQ (%rdi), %ymm2, %ymm2
141 vpmovmskb %ymm2, %eax
149 /* A byte or int32 is different within 16 or 32 bytes. */
151 # ifdef USE_AS_WMEMCMP
153 movl (%rdi, %rcx), %edx
154 cmpl (%rsi, %rcx), %edx
160 movzbl (%rdi, %rcx), %eax
161 movzbl (%rsi, %rcx), %edx
167 # ifdef USE_AS_WMEMCMP
173 jne L(wmemcmp_return)
178 /* Load as big endian with overlapping movbe to avoid branches. */
183 movbe -4(%rdi, %rdx), %edi
184 movbe -4(%rsi, %rdx), %esi
199 /* Load as big endian to avoid branches. */
206 movb -1(%rdi, %rdx), %al
207 movb -1(%rsi, %rdx), %cl
208 /* Subtraction is okay because the upper 8 bits are zero. */
227 # ifdef USE_AS_WMEMCMP
228 /* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes. */
243 /* It is between 8 and 15 bytes. */
246 VPCMPEQ %xmm1, %xmm2, %xmm2
247 vpmovmskb %xmm2, %eax
250 /* Use overlapping loads to avoid branches. */
251 leaq -8(%rdi, %rdx), %rdi
252 leaq -8(%rsi, %rdx), %rsi
255 VPCMPEQ %xmm1, %xmm2, %xmm2
256 vpmovmskb %xmm2, %eax
263 /* From 16 to 31 bytes. No branch when size == 16. */
264 vmovdqu (%rsi), %xmm2
265 VPCMPEQ (%rdi), %xmm2, %xmm2
266 vpmovmskb %xmm2, %eax
270 /* Use overlapping loads to avoid branches. */
271 leaq -16(%rdi, %rdx), %rdi
272 leaq -16(%rsi, %rdx), %rsi
273 vmovdqu (%rsi), %xmm2
274 VPCMPEQ (%rdi), %xmm2, %xmm2
275 vpmovmskb %xmm2, %eax
282 /* More than 8 * VEC. Check the first VEC. */
283 vmovdqu (%rsi), %ymm2
284 VPCMPEQ (%rdi), %ymm2, %ymm2
285 vpmovmskb %ymm2, %eax
289 /* Align the first memory area for aligned loads in the loop.
290 Compute how much the first memory area is misaligned. */
292 andl $(VEC_SIZE - 1), %ecx
293 /* Get the negative of offset for alignment. */
295 /* Adjust the second memory area. */
297 /* Adjust the first memory area which should be aligned now. */
303 /* Compare 4 * VEC at a time forward. */
304 vmovdqu (%rsi), %ymm1
305 VPCMPEQ (%rdi), %ymm1, %ymm1
307 vmovdqu VEC_SIZE(%rsi), %ymm2
308 VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
309 vpand %ymm2, %ymm1, %ymm5
311 vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3
312 VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
313 vpand %ymm3, %ymm5, %ymm5
315 vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4
316 VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
317 vpand %ymm4, %ymm5, %ymm5
322 addq $(VEC_SIZE * 4), %rdi
323 addq $(VEC_SIZE * 4), %rsi
325 subq $(VEC_SIZE * 4), %rdx
326 cmpq $(VEC_SIZE * 4), %rdx
329 /* Less than 4 * VEC. */
332 cmpq $(VEC_SIZE * 2), %rdx
336 /* From 2 * VEC to 4 * VEC. */
337 vmovdqu (%rsi), %ymm2
338 VPCMPEQ (%rdi), %ymm2, %ymm2
339 vpmovmskb %ymm2, %eax
345 vmovdqu (%rsi), %ymm2
346 VPCMPEQ (%rdi), %ymm2, %ymm2
347 vpmovmskb %ymm2, %eax
351 /* Use overlapping loads to avoid branches. */
352 leaq -(3 * VEC_SIZE)(%rdi, %rdx), %rdi
353 leaq -(3 * VEC_SIZE)(%rsi, %rdx), %rsi
354 vmovdqu (%rsi), %ymm2
355 VPCMPEQ (%rdi), %ymm2, %ymm2
356 vpmovmskb %ymm2, %eax
362 vmovdqu (%rsi), %ymm2
363 VPCMPEQ (%rdi), %ymm2, %ymm2
364 vpmovmskb %ymm2, %eax
372 vpmovmskb %ymm1, %eax
375 vpmovmskb %ymm2, %eax
378 vpmovmskb %ymm3, %eax
381 vpmovmskb %ymm4, %eax
384 # ifdef USE_AS_WMEMCMP
386 movl (VEC_SIZE * 3)(%rdi, %rcx), %edx
387 cmpl (VEC_SIZE * 3)(%rsi, %rcx), %edx
388 jmp L(wmemcmp_return)
390 movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
391 movzbl (VEC_SIZE * 3)(%rsi, %rcx), %edx
400 # ifdef USE_AS_WMEMCMP
402 movl VEC_SIZE(%rdi, %rcx), %edx
403 cmpl VEC_SIZE(%rsi, %rcx), %edx
404 jmp L(wmemcmp_return)
406 movzbl VEC_SIZE(%rdi, %rcx), %eax
407 movzbl VEC_SIZE(%rsi, %rcx), %edx
416 # ifdef USE_AS_WMEMCMP
418 movl (VEC_SIZE * 2)(%rdi, %rcx), %edx
419 cmpl (VEC_SIZE * 2)(%rsi, %rcx), %edx
420 jmp L(wmemcmp_return)
422 movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
423 movzbl (VEC_SIZE * 2)(%rsi, %rcx), %edx