/* strchr/strchrnul optimized with AVX2.
   Copyright (C) 2017-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef STRCHR
#  define STRCHR __strchr_avx2
# endif

# ifdef USE_AS_WCSCHR
#  define VPBROADCAST vpbroadcastd
#  define VPCMPEQ vpcmpeqd
#  define CHAR_REG esi
# else
#  define VPBROADCAST vpbroadcastb
#  define VPCMPEQ vpcmpeqb
#  define CHAR_REG sil
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER vzeroupper
# endif

# define VEC_SIZE 32

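/* Overview: CHAR is broadcast into YMM0 and YMM9 is kept at zero so
   that every VEC_SIZE-byte chunk can be compared against both CHAR and
   the terminating null.  The per-byte comparison results are ORed and
   converted to a bitmask with vpmovmskb; tzcnt on that mask yields the
   offset of the first match within the chunk.  A possibly unaligned
   first vector is handled separately, then up to four single vectors,
   and finally the main loop scans 4 * VEC_SIZE bytes per iteration
   from a 4 * VEC_SIZE aligned address.  */
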
        .section .text.avx,"ax",@progbits
ENTRY (STRCHR)
        movl %edi, %ecx
        /* Broadcast CHAR to YMM0.  */
        vmovd %esi, %xmm0
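        /* Zero YMM9; the data is compared against it to find the
           terminating null.  */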
        vpxor %xmm9, %xmm9, %xmm9
        VPBROADCAST %xmm0, %ymm0
        /* Check whether a single vector load may cross a page boundary.  */
        andl $(2 * VEC_SIZE - 1), %ecx
        cmpl $VEC_SIZE, %ecx
        ja L(cross_page_boundary)

        /* Check the first VEC_SIZE bytes.  Search for both CHAR and the
           null byte.  */
        vmovdqu (%rdi), %ymm8
        VPCMPEQ %ymm8, %ymm0, %ymm1
        VPCMPEQ %ymm8, %ymm9, %ymm2
        vpor %ymm1, %ymm2, %ymm1
        vpmovmskb %ymm1, %eax
        testl %eax, %eax
        jnz L(first_vec_x0)

        /* Align data for aligned loads in the loop.  */
        addq $VEC_SIZE, %rdi
        andl $(VEC_SIZE - 1), %ecx
        andq $-VEC_SIZE, %rdi

        jmp L(more_4x_vec)

        .p2align 4
L(cross_page_boundary):
        andl $(VEC_SIZE - 1), %ecx
        andq $-VEC_SIZE, %rdi
        vmovdqu (%rdi), %ymm8
        VPCMPEQ %ymm8, %ymm0, %ymm1
        VPCMPEQ %ymm8, %ymm9, %ymm2
        vpor %ymm1, %ymm2, %ymm1
        vpmovmskb %ymm1, %eax
        /* Remove the leading bytes.  */
        sarl %cl, %eax
        testl %eax, %eax
        jz L(aligned_more)
        /* Found CHAR or the null byte.  */
        tzcntl %eax, %eax
        addq %rcx, %rax
# ifdef USE_AS_STRCHRNUL
        addq %rdi, %rax
# else
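        /* For strchr, return NULL when the match is the terminating
           null rather than CHAR.  */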
        xorl %edx, %edx
        leaq (%rdi, %rax), %rax
        cmp (%rax), %CHAR_REG
        cmovne %rdx, %rax
# endif
        VZEROUPPER
        ret

        .p2align 4
L(aligned_more):
        addq $VEC_SIZE, %rdi

L(more_4x_vec):
        /* Check the first 4 * VEC_SIZE bytes.  Only one VEC_SIZE at a time
           since data is only aligned to VEC_SIZE.  */
        vmovdqa (%rdi), %ymm8
        VPCMPEQ %ymm8, %ymm0, %ymm1
        VPCMPEQ %ymm8, %ymm9, %ymm2
        vpor %ymm1, %ymm2, %ymm1
        vpmovmskb %ymm1, %eax
        testl %eax, %eax
        jnz L(first_vec_x0)

        vmovdqa VEC_SIZE(%rdi), %ymm8
        VPCMPEQ %ymm8, %ymm0, %ymm1
        VPCMPEQ %ymm8, %ymm9, %ymm2
        vpor %ymm1, %ymm2, %ymm1
        vpmovmskb %ymm1, %eax
        testl %eax, %eax
        jnz L(first_vec_x1)

        vmovdqa (VEC_SIZE * 2)(%rdi), %ymm8
        VPCMPEQ %ymm8, %ymm0, %ymm1
        VPCMPEQ %ymm8, %ymm9, %ymm2
        vpor %ymm1, %ymm2, %ymm1
        vpmovmskb %ymm1, %eax
        testl %eax, %eax
        jnz L(first_vec_x2)

        vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
        VPCMPEQ %ymm8, %ymm0, %ymm1
        VPCMPEQ %ymm8, %ymm9, %ymm2
        vpor %ymm1, %ymm2, %ymm1
        vpmovmskb %ymm1, %eax
        testl %eax, %eax
        jnz L(first_vec_x3)

        addq $(VEC_SIZE * 4), %rdi

        /* Align data to 4 * VEC_SIZE.  */
        movq %rdi, %rcx
        andl $(4 * VEC_SIZE - 1), %ecx
        andq $-(4 * VEC_SIZE), %rdi
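        /* Aligning down may step back over bytes already checked above;
           they are known to contain neither CHAR nor a null byte, so
           rescanning them in the loop is harmless.  */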

        .p2align 4
L(loop_4x_vec):
        /* Compare 4 * VEC at a time forward.  */
        vmovdqa (%rdi), %ymm5
        vmovdqa VEC_SIZE(%rdi), %ymm6
        vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7
        vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8

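        /* Look for CHAR in each of the four vectors.  */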
        VPCMPEQ %ymm5, %ymm0, %ymm1
        VPCMPEQ %ymm6, %ymm0, %ymm2
        VPCMPEQ %ymm7, %ymm0, %ymm3
        VPCMPEQ %ymm8, %ymm0, %ymm4

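        /* Look for the terminating null in each of the four vectors.  */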
        VPCMPEQ %ymm5, %ymm9, %ymm5
        VPCMPEQ %ymm6, %ymm9, %ymm6
        VPCMPEQ %ymm7, %ymm9, %ymm7
        VPCMPEQ %ymm8, %ymm9, %ymm8

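        /* Merge the CHAR and null matches of each vector, then combine
           all four vectors into a single mask.  */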
        vpor %ymm1, %ymm5, %ymm1
        vpor %ymm2, %ymm6, %ymm2
        vpor %ymm3, %ymm7, %ymm3
        vpor %ymm4, %ymm8, %ymm4

        vpor %ymm1, %ymm2, %ymm5
        vpor %ymm3, %ymm4, %ymm6

        vpor %ymm5, %ymm6, %ymm5

        vpmovmskb %ymm5, %eax
        testl %eax, %eax
        jnz L(4x_vec_end)

        addq $(VEC_SIZE * 4), %rdi

        jmp L(loop_4x_vec)

        .p2align 4
L(first_vec_x0):
        /* Found CHAR or the null byte.  */
        tzcntl %eax, %eax
# ifdef USE_AS_STRCHRNUL
        addq %rdi, %rax
# else
        xorl %edx, %edx
        leaq (%rdi, %rax), %rax
        cmp (%rax), %CHAR_REG
        cmovne %rdx, %rax
# endif
        VZEROUPPER
        ret

        .p2align 4
L(first_vec_x1):
        tzcntl %eax, %eax
# ifdef USE_AS_STRCHRNUL
        addq $VEC_SIZE, %rax
        addq %rdi, %rax
# else
        xorl %edx, %edx
        leaq VEC_SIZE(%rdi, %rax), %rax
        cmp (%rax), %CHAR_REG
        cmovne %rdx, %rax
# endif
        VZEROUPPER
        ret

        .p2align 4
L(first_vec_x2):
        tzcntl %eax, %eax
# ifdef USE_AS_STRCHRNUL
        addq $(VEC_SIZE * 2), %rax
        addq %rdi, %rax
# else
        xorl %edx, %edx
        leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
        cmp (%rax), %CHAR_REG
        cmovne %rdx, %rax
# endif
        VZEROUPPER
        ret

        .p2align 4
L(4x_vec_end):
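        /* At least one of the four vectors holds CHAR or the null byte;
           find the first vector that does.  */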
        vpmovmskb %ymm1, %eax
        testl %eax, %eax
        jnz L(first_vec_x0)
        vpmovmskb %ymm2, %eax
        testl %eax, %eax
        jnz L(first_vec_x1)
        vpmovmskb %ymm3, %eax
        testl %eax, %eax
        jnz L(first_vec_x2)
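        /* Otherwise the match is in the fourth vector; fall through to
           L(first_vec_x3).  */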
        vpmovmskb %ymm4, %eax
        testl %eax, %eax
L(first_vec_x3):
        tzcntl %eax, %eax
# ifdef USE_AS_STRCHRNUL
        addq $(VEC_SIZE * 3), %rax
        addq %rdi, %rax
# else
        xorl %edx, %edx
        leaq (VEC_SIZE * 3)(%rdi, %rax), %rax
        cmp (%rax), %CHAR_REG
        cmovne %rdx, %rax
# endif
        VZEROUPPER
        ret

END (STRCHR)
#endif