1 /* strchr/strchrnul optimized with AVX2.
2 Copyright (C) 2017-2018 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
/* NOTE(review): this is a sampled fragment — the #ifdef/#else/#endif lines
   that normally guard these definitions (e.g. selecting the wide-character
   vs. byte variant, and the IS_IN (libc) / multiarch guards) are not
   visible here.  Both VPBROADCAST/VPCMPEQ definition pairs below would
   conflict if taken literally; presumably they sit on opposite sides of a
   hidden #ifdef — confirm against the full file.  */
24 # define STRCHR __strchr_avx2
/* Wide-character variant: operate on 32-bit elements (dwords).  */
28 # define VPBROADCAST vpbroadcastd
29 # define VPCMPEQ vpcmpeqd
/* Byte variant: operate on 8-bit elements.  */
32 # define VPBROADCAST vpbroadcastb
33 # define VPCMPEQ vpcmpeqb
38 # define VZEROUPPER vzeroupper
/* Place the AVX2 implementation in its own text section.  */
43 .section .text.avx,"ax",@progbits
/* Function entry (the ENTRY(STRCHR) line itself is not visible in this
   fragment).  rdi = string pointer, (x)mm0 holds CHAR to search for.
   ymm9 is kept as an all-zero vector for the NUL-terminator compares
   used throughout.  */
46 /* Broadcast CHAR to YMM0. */
48 vpxor %xmm9, %xmm9, %xmm9
49 VPBROADCAST %xmm0, %ymm0
50 /* Check if we may cross page boundary with one vector load. */
51 andl $(2 * VEC_SIZE - 1), %ecx
/* NOTE(review): a compare against ecx (hidden line between the andl and
   this ja) sets the flags consumed here — the andl alone cannot produce
   the "above" condition meaningfully; verify against the full source.
   The label name "cros_page_boundary" is a known upstream typo for
   "cross_page_boundary"; renaming is unsafe from this fragment since
   other references may exist in hidden lines.  */
53 ja L(cros_page_boundary)
55 /* Check the first VEC_SIZE bytes. Search for both CHAR and the
/* ymm8 holds the loaded data (load instruction hidden); compare it
   against CHAR (ymm0) and against zero/NUL (ymm9), then OR the two
   match masks so one vpmovmskb can detect either hit.  */
58 VPCMPEQ %ymm8, %ymm0, %ymm1
59 VPCMPEQ %ymm8, %ymm9, %ymm2
60 vpor %ymm1, %ymm2, %ymm1
65 /* Align data for aligned loads in the loop. */
67 andl $(VEC_SIZE - 1), %ecx
/* Cross-page path: the first load must not stray past the page end, so
   the load is done from a VEC_SIZE-aligned address (alignment code
   partly hidden) and leading garbage bits are masked off below.  */
73 L(cros_page_boundary):
74 andl $(VEC_SIZE - 1), %ecx
77 VPCMPEQ %ymm8, %ymm0, %ymm1
78 VPCMPEQ %ymm8, %ymm9, %ymm2
79 vpor %ymm1, %ymm2, %ymm1
81 /* Remove the leading bytes. */
85 /* Found CHAR or the null byte. */
/* For strchrnul the hit address is returned unconditionally; the
   plain-strchr check that returns NULL on a NUL hit is in hidden lines.  */
88 # ifdef USE_AS_STRCHRNUL
/* rax = bit index of first match (tzcnt hidden); form the address.  */
92 leaq (%rdi, %rax), %rax
104 /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
105 since data is only aligned to VEC_SIZE. */
/* Probe vector 0.  Aligned load is safe: rdi was rounded down to a
   VEC_SIZE boundary earlier.  Each probe ORs the CHAR-match and
   NUL-match masks; the testl/jnz dispatch after each vpmovmskb is in
   hidden lines.  */
106 vmovdqa (%rdi), %ymm8
107 VPCMPEQ %ymm8, %ymm0, %ymm1
108 VPCMPEQ %ymm8, %ymm9, %ymm2
109 vpor %ymm1, %ymm2, %ymm1
110 vpmovmskb %ymm1, %eax
/* Probe vector 1.  */
114 vmovdqa VEC_SIZE(%rdi), %ymm8
115 VPCMPEQ %ymm8, %ymm0, %ymm1
116 VPCMPEQ %ymm8, %ymm9, %ymm2
117 vpor %ymm1, %ymm2, %ymm1
118 vpmovmskb %ymm1, %eax
/* Probe vector 2.  */
122 vmovdqa (VEC_SIZE * 2)(%rdi), %ymm8
123 VPCMPEQ %ymm8, %ymm0, %ymm1
124 VPCMPEQ %ymm8, %ymm9, %ymm2
125 vpor %ymm1, %ymm2, %ymm1
126 vpmovmskb %ymm1, %eax
/* Probe vector 3.  */
130 vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
131 VPCMPEQ %ymm8, %ymm0, %ymm1
132 VPCMPEQ %ymm8, %ymm9, %ymm2
133 vpor %ymm1, %ymm2, %ymm1
134 vpmovmskb %ymm1, %eax
/* No match in the first four vectors: advance past them, then round
   rdi down to a 4*VEC_SIZE boundary so the main loop can use four
   aligned loads per iteration.  The rounding may step back over bytes
   just scanned; that is harmless (they contained no match).  */
138 addq $(VEC_SIZE * 4), %rdi
140 /* Align data to 4 * VEC_SIZE. */
142 andl $(4 * VEC_SIZE - 1), %ecx
143 andq $-(4 * VEC_SIZE), %rdi
/* Main loop (loop label and back-edge jump are in hidden lines):
   process 4 * VEC_SIZE bytes per iteration with aligned loads.  */
147 /* Compare 4 * VEC at a time forward. */
148 vmovdqa (%rdi), %ymm5
149 vmovdqa VEC_SIZE(%rdi), %ymm6
150 vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7
151 vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
/* ymm1..ymm4 = per-vector CHAR matches.  */
153 VPCMPEQ %ymm5, %ymm0, %ymm1
154 VPCMPEQ %ymm6, %ymm0, %ymm2
155 VPCMPEQ %ymm7, %ymm0, %ymm3
156 VPCMPEQ %ymm8, %ymm0, %ymm4
/* Overwrite the data registers with their NUL matches — the raw data
   is no longer needed this iteration.  */
158 VPCMPEQ %ymm5, %ymm9, %ymm5
159 VPCMPEQ %ymm6, %ymm9, %ymm6
160 VPCMPEQ %ymm7, %ymm9, %ymm7
161 VPCMPEQ %ymm8, %ymm9, %ymm8
/* ymm1..ymm4 = CHAR-or-NUL matches per vector.  */
163 vpor %ymm1, %ymm5, %ymm1
164 vpor %ymm2, %ymm6, %ymm2
165 vpor %ymm3, %ymm7, %ymm3
166 vpor %ymm4, %ymm8, %ymm4
/* Reduce the four match vectors into one; a single vpmovmskb then
   tells whether ANY of the 4*VEC_SIZE bytes matched.  */
168 vpor %ymm1, %ymm2, %ymm5
169 vpor %ymm3, %ymm4, %ymm6
171 vpor %ymm5, %ymm6, %ymm5
173 vpmovmskb %ymm5, %eax
/* No match: advance and loop (conditional jump hidden).  */
177 addq $(VEC_SIZE * 4), %rdi
/* --- Tail dispatch: a match was seen; ymm1..ymm4 are re-scanned to
   find which vector holds it.  In each case rax gets the bit index of
   the first match (tzcnt hidden), the hit address is formed, and —
   for plain strchr only — the byte is re-read and compared against
   CHAR_REG so that a NUL hit returns NULL (branch hidden).  */
183 /* Found CHAR or the null byte. */
185 # ifdef USE_AS_STRCHRNUL
/* Match in vector 0.  */
189 leaq (%rdi, %rax), %rax
190 cmp (%rax), %CHAR_REG
/* Match in vector 1.  */
199 # ifdef USE_AS_STRCHRNUL
204 leaq VEC_SIZE(%rdi, %rax), %rax
205 cmp (%rax), %CHAR_REG
/* Match in vector 2.  */
214 # ifdef USE_AS_STRCHRNUL
215 addq $(VEC_SIZE * 2), %rax
219 leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
220 cmp (%rax), %CHAR_REG
/* Locate the first set vector: extract each mask in turn (the
   testl/jnz chain between these vpmovmskb lines is hidden).  */
228 vpmovmskb %ymm1, %eax
231 vpmovmskb %ymm2, %eax
234 vpmovmskb %ymm3, %eax
237 vpmovmskb %ymm4, %eax
/* Match in vector 3.  */
241 # ifdef USE_AS_STRCHRNUL
242 addq $(VEC_SIZE * 3), %rax
246 leaq (VEC_SIZE * 3)(%rdi, %rax), %rax
247 cmp (%rax), %CHAR_REG