/* memrchr optimized with AVX2.
   Copyright (C) 2017-2019 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */
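
/* Overview: __memrchr_avx2 (s, c, n) returns a pointer to the last
   byte equal to C within the first N bytes of S, or NULL if there is
   no such byte.  Roughly, it implements the following scalar logic
   (a minimal C sketch of the semantics, for orientation only):

	void *
	memrchr (const void *s, int c, size_t n)
	{
	  const unsigned char *p = (const unsigned char *) s;
	  while (n--)
	    if (p[n] == (unsigned char) c)
	      return (void *) (p + n);
	  return NULL;
	}

   The AVX2 code below vectorizes this: vpcmpeqb compares 32
   (VEC_SIZE) bytes against C at once, vpmovmskb turns the per-byte
   result into a bit mask, and bsrl picks the highest set bit, i.e.
   the match with the largest offset.  The buffer is scanned from its
   end towards its start, 4 * VEC_SIZE bytes per loop iteration once
   the pointer has been aligned.  */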
# include <sysdep.h>

# define VZEROUPPER	vzeroupper

/* Number of bytes in an AVX2 (ymm) vector.  */
# define VEC_SIZE	32

	.section .text.avx,"ax",@progbits
ENTRY (__memrchr_avx2)
	/* Broadcast CHAR (the low byte of the second argument, in
	   %esi) to all 32 bytes of YMM0.  */
	vmovd	%esi, %xmm0
	vpbroadcastb %xmm0, %ymm0
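
	/* %rdi = S, %rdx = N (RDX_LP/RDI_LP name the full 64-bit
	   registers on LP64 and the 32-bit ones on x32).  If N is at
	   most VEC_SIZE, the whole search fits in one vector; handle
	   that tail separately.  */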
	sub	$VEC_SIZE, %RDX_LP
	jbe	L(last_vec_or_less)
	add	%RDX_LP, %RDI_LP

	/* Check the last VEC_SIZE bytes.  */
	vpcmpeqb (%rdi), %ymm0, %ymm1
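
	/* If the last vector had no match, move %rdi back to the base
	   of the next 4 * VEC_SIZE block to search and test whether it
	   is already VEC_SIZE-aligned (low address bits in %ecx).  */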
	subq	$(VEC_SIZE * 4), %rdi
	movl	%edi, %ecx
	andl	$(VEC_SIZE - 1), %ecx

	/* Align data for aligned loads in the loop.  */
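
	/* If 4 * VEC_SIZE bytes or fewer remain after alignment, skip
	   the unrolled checks and handle the remainder directly.  */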
	subq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_4x_vec_or_less)
	/* Check the last 4 * VEC_SIZE.  Only one VEC_SIZE at a time
	   since data is only aligned to VEC_SIZE.  */
	vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1

	vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2

	vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3

	vpcmpeqb (%rdi), %ymm0, %ymm4
	/* Align data to 4 * VEC_SIZE for loop with fewer branches.
	   There are some overlaps with above if data isn't aligned
	   to 4 * VEC_SIZE.  */
	movl	%edi, %ecx
	andl	$(VEC_SIZE * 4 - 1), %ecx
	addq	$(VEC_SIZE * 4), %rdi
	addq	$(VEC_SIZE * 4), %rdx
	andq	$-(VEC_SIZE * 4), %rdi
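
	/* Main loop: %rdi is now 4 * VEC_SIZE aligned, so all four
	   loads below can use aligned vmovdqa.  Each iteration steps
	   backwards by 4 * VEC_SIZE; once at most 4 * VEC_SIZE bytes
	   remain the tail is handled in L(last_4x_vec_or_less).  */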
	/* Compare 4 * VEC at a time, moving backwards through the
	   buffer.  */
	subq	$(VEC_SIZE * 4), %rdi
	subq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_4x_vec_or_less)
	vmovdqa	(%rdi), %ymm1
	vmovdqa	VEC_SIZE(%rdi), %ymm2
	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm3
	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm4

	vpcmpeqb %ymm1, %ymm0, %ymm1
	vpcmpeqb %ymm2, %ymm0, %ymm2
	vpcmpeqb %ymm3, %ymm0, %ymm3
	vpcmpeqb %ymm4, %ymm0, %ymm4
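
	/* OR the four per-byte compare results together so that a
	   single vpmovmskb/test can tell whether there is a match
	   anywhere in this 4 * VEC_SIZE block.  */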
	vpor	%ymm1, %ymm2, %ymm5
	vpor	%ymm3, %ymm4, %ymm6
	vpor	%ymm5, %ymm6, %ymm5

	vpmovmskb %ymm5, %eax
	/* There is a match somewhere in the block.  Since the last
	   occurrence is wanted, check the highest-addressed vector
	   (ymm4, loaded from offset 3 * VEC_SIZE) first and work down
	   to the lowest (ymm1).  */
	vpmovmskb %ymm4, %eax

	vpmovmskb %ymm3, %eax

	vpmovmskb %ymm2, %eax

	vpmovmskb %ymm1, %eax
L(last_4x_vec_or_less):
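	/* At most 4 * VEC_SIZE bytes are left.  %rdx was biased by
	   -(VEC_SIZE * 4) before the jump here, so add that back to
	   recover the number of remaining bytes in %edx.  */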
	addl	$(VEC_SIZE * 4), %edx
	cmpl	$(VEC_SIZE * 2), %edx
	vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax

	vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2
	vpmovmskb %ymm2, %eax

	vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3
	vpmovmskb %ymm3, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x1_check)
	cmpl	$(VEC_SIZE * 3), %edx
	vpcmpeqb (%rdi), %ymm0, %ymm4
	vpmovmskb %ymm4, %eax

	subq	$(VEC_SIZE * 4), %rdx
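
	/* At most 2 * VEC_SIZE bytes remain: check the last vector
	   first, then the one below it, validating the match offset
	   against the remaining length.  */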
	vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x3_check)
	vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax

	subq	$(VEC_SIZE * 2), %rdx

	addl	$(VEC_SIZE * 2), %eax
	addl	$(VEC_SIZE * 2), %eax

	addl	$(VEC_SIZE * 3), %eax
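
	/* The *_check return paths below also verify that the match
	   offset actually falls inside the requested N bytes before
	   computing the return pointer.  */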
L(last_vec_x1_check):
	subq	$(VEC_SIZE * 3), %rdx
L(last_vec_x3_check):
	addl	$(VEC_SIZE * 3), %eax
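
	/* Length of at most VEC_SIZE with %rdi already VEC_SIZE
	   aligned: one vector compare plus a mask of (1 << length) - 1
	   to ignore bytes past the end.  */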
L(last_vec_or_less_aligned):
	movl	%edx, %ecx
	vpcmpeqb (%rdi), %ymm0, %ymm1
	/* Build (1 << length) - 1; the 64-bit shift supports
	   length == VEC_SIZE (rdx << 32).  */
	movl	$1, %edx
	salq	%cl, %rdx
	subq	$1, %rdx
	vpmovmskb %ymm1, %eax
	/* Remove the trailing bytes.  */
L(last_vec_or_less):
	addl	$VEC_SIZE, %edx

	/* Check for zero length.  */

	movl	%edi, %ecx
	andl	$(VEC_SIZE - 1), %ecx
	jz	L(last_vec_or_less_aligned)
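
	/* Length of at most VEC_SIZE but %rdi is not aligned: round
	   %rdi down to a VEC_SIZE boundary.  If the misalignment plus
	   the length spills into a second vector, two aligned checks
	   are needed.  */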
	andq	$-VEC_SIZE, %rdi

	ja	L(last_vec_2x_aligned)
	/* Check the last VEC.  */
	vpcmpeqb (%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax

	/* Remove the leading and trailing bytes.  */
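
	/* The tail spans two aligned vectors: check the upper vector
	   with a mask that drops bytes past the end, then the lower
	   vector with a shift that drops bytes before the start.  */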
L(last_vec_2x_aligned):
	/* Check the last VEC.  */
	vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm1

	vpmovmskb %ymm1, %eax

	/* Remove the trailing bytes.  */

	/* Check the second last VEC.  */
	vpcmpeqb (%rdi), %ymm0, %ymm1

	vpmovmskb %ymm1, %eax
	/* Remove the leading bytes.  Must use unsigned right shift for
	   bsrl below.  */