]>
Commit | Line | Data |
---|---|---|
5ac7aa1d | 1 | /* memrchr optimized with SSE2. |
581c785b | 2 | Copyright (C) 2017-2022 Free Software Foundation, Inc. |
5ac7aa1d L |
3 | This file is part of the GNU C Library. |
4 | ||
5 | The GNU C Library is free software; you can redistribute it and/or | |
6 | modify it under the terms of the GNU Lesser General Public | |
7 | License as published by the Free Software Foundation; either | |
8 | version 2.1 of the License, or (at your option) any later version. | |
9 | ||
10 | The GNU C Library is distributed in the hope that it will be useful, | |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | Lesser General Public License for more details. | |
14 | ||
15 | You should have received a copy of the GNU Lesser General Public | |
16 | License along with the GNU C Library; if not, see | |
5a82c748 | 17 | <https://www.gnu.org/licenses/>. */ |
5ac7aa1d | 18 | |
ceabdcd1 NG |
19 | #include <isa-level.h> |
20 | ||
21 | /* MINIMUM_X86_ISA_LEVEL <= 2 because there is no V2 implementation | |
22 | so we need this to build for ISA V2 builds. */ | |
23 | #if ISA_SHOULD_BUILD (2) | |
24 | ||
08af081f NG |
25 | # ifndef MEMRCHR |
26 | # define MEMRCHR __memrchr_sse2 | |
27 | # endif | |
08af081f | 28 | |
ceabdcd1 NG |
29 | # include <sysdep.h> |
30 | # define VEC_SIZE 16 | |
31 | # define PAGE_SIZE 4096 | |
5ac7aa1d | 32 | |
08af081f NG |
33 | .text |
34 | ENTRY_P2ALIGN(MEMRCHR, 6) | |
ceabdcd1 | 35 | # ifdef __ILP32__ |
08af081f NG |
36 | /* Clear upper bits. */ |
37 | mov %RDX_LP, %RDX_LP | |
ceabdcd1 | 38 | # endif |
08af081f NG |
39 | movd %esi, %xmm0 |
40 | ||
41 | /* Get end pointer. */ | |
42 | leaq (%rdx, %rdi), %rcx | |
43 | ||
44 | punpcklbw %xmm0, %xmm0 | |
45 | punpcklwd %xmm0, %xmm0 | |
46 | pshufd $0, %xmm0, %xmm0 | |
47 | ||
48 | /* Check if we can load 1x VEC without cross a page. */ | |
49 | testl $(PAGE_SIZE - VEC_SIZE), %ecx | |
50 | jz L(page_cross) | |
51 | ||
52 | /* NB: This load happens regardless of whether rdx (len) is zero. Since | |
53 | it doesn't cross a page and the standard gurantees any pointer have | |
54 | at least one-valid byte this load must be safe. For the entire | |
55 | history of the x86 memrchr implementation this has been possible so | |
56 | no code "should" be relying on a zero-length check before this load. | |
57 | The zero-length check is moved to the page cross case because it is | |
58 | 1) pretty cold and including it pushes the hot case len <= VEC_SIZE | |
59 | into 2-cache lines. */ | |
60 | movups -(VEC_SIZE)(%rcx), %xmm1 | |
61 | pcmpeqb %xmm0, %xmm1 | |
62 | pmovmskb %xmm1, %eax | |
63 | ||
64 | subq $VEC_SIZE, %rdx | |
65 | ja L(more_1x_vec) | |
66 | L(ret_vec_x0_test): | |
67 | /* Zero-flag set if eax (src) is zero. Destination unchanged if src is | |
68 | zero. */ | |
69 | bsrl %eax, %eax | |
70 | jz L(ret_0) | |
71 | /* Check if the CHAR match is in bounds. Need to truly zero `eax` here | |
72 | if out of bounds. */ | |
73 | addl %edx, %eax | |
74 | jl L(zero_0) | |
75 | /* Since we subtracted VEC_SIZE from rdx earlier we can just add to base | |
76 | ptr. */ | |
77 | addq %rdi, %rax | |
78 | L(ret_0): | |
79 | ret | |
80 | ||
81 | .p2align 4,, 5 | |
82 | L(ret_vec_x0): | |
83 | bsrl %eax, %eax | |
84 | leaq -(VEC_SIZE)(%rcx, %rax), %rax | |
85 | ret | |
86 | ||
87 | .p2align 4,, 2 | |
88 | L(zero_0): | |
89 | xorl %eax, %eax | |
90 | ret | |
91 | ||
92 | ||
93 | .p2align 4,, 8 | |
94 | L(more_1x_vec): | |
95 | testl %eax, %eax | |
96 | jnz L(ret_vec_x0) | |
97 | ||
98 | /* Align rcx (pointer to string). */ | |
99 | decq %rcx | |
100 | andq $-VEC_SIZE, %rcx | |
101 | ||
102 | movq %rcx, %rdx | |
103 | /* NB: We could consistenyl save 1-byte in this pattern with `movaps | |
104 | %xmm0, %xmm1; pcmpeq IMM8(r), %xmm1; ...`. The reason against it is | |
105 | it adds more frontend uops (even if the moves can be eliminated) and | |
106 | some percentage of the time actual backend uops. */ | |
107 | movaps -(VEC_SIZE)(%rcx), %xmm1 | |
108 | pcmpeqb %xmm0, %xmm1 | |
109 | subq %rdi, %rdx | |
110 | pmovmskb %xmm1, %eax | |
111 | ||
112 | cmpq $(VEC_SIZE * 2), %rdx | |
113 | ja L(more_2x_vec) | |
114 | L(last_2x_vec): | |
115 | subl $VEC_SIZE, %edx | |
116 | jbe L(ret_vec_x0_test) | |
117 | ||
118 | testl %eax, %eax | |
119 | jnz L(ret_vec_x0) | |
120 | ||
121 | movaps -(VEC_SIZE * 2)(%rcx), %xmm1 | |
122 | pcmpeqb %xmm0, %xmm1 | |
123 | pmovmskb %xmm1, %eax | |
124 | ||
125 | subl $VEC_SIZE, %edx | |
126 | bsrl %eax, %eax | |
127 | jz L(ret_1) | |
128 | addl %edx, %eax | |
129 | jl L(zero_0) | |
130 | addq %rdi, %rax | |
131 | L(ret_1): | |
132 | ret | |
133 | ||
134 | /* Don't align. Otherwise lose 2-byte encoding in jump to L(page_cross) | |
135 | causes the hot pause (length <= VEC_SIZE) to span multiple cache | |
136 | lines. Naturally aligned % 16 to 8-bytes. */ | |
137 | L(page_cross): | |
138 | /* Zero length check. */ | |
139 | testq %rdx, %rdx | |
140 | jz L(zero_0) | |
141 | ||
142 | leaq -1(%rcx), %r8 | |
143 | andq $-(VEC_SIZE), %r8 | |
144 | ||
145 | movaps (%r8), %xmm1 | |
146 | pcmpeqb %xmm0, %xmm1 | |
147 | pmovmskb %xmm1, %esi | |
148 | /* Shift out negative alignment (because we are starting from endptr and | |
149 | working backwards). */ | |
150 | negl %ecx | |
151 | /* 32-bit shift but VEC_SIZE=16 so need to mask the shift count | |
152 | explicitly. */ | |
153 | andl $(VEC_SIZE - 1), %ecx | |
154 | shl %cl, %esi | |
155 | movzwl %si, %eax | |
156 | leaq (%rdi, %rdx), %rcx | |
157 | cmpq %rdi, %r8 | |
158 | ja L(more_1x_vec) | |
159 | subl $VEC_SIZE, %edx | |
160 | bsrl %eax, %eax | |
161 | jz L(ret_2) | |
162 | addl %edx, %eax | |
163 | jl L(zero_1) | |
164 | addq %rdi, %rax | |
165 | L(ret_2): | |
166 | ret | |
167 | ||
168 | /* Fits in aliging bytes. */ | |
169 | L(zero_1): | |
170 | xorl %eax, %eax | |
171 | ret | |
172 | ||
173 | .p2align 4,, 5 | |
174 | L(ret_vec_x1): | |
175 | bsrl %eax, %eax | |
176 | leaq -(VEC_SIZE * 2)(%rcx, %rax), %rax | |
177 | ret | |
178 | ||
179 | .p2align 4,, 8 | |
180 | L(more_2x_vec): | |
181 | testl %eax, %eax | |
182 | jnz L(ret_vec_x0) | |
183 | ||
184 | movaps -(VEC_SIZE * 2)(%rcx), %xmm1 | |
185 | pcmpeqb %xmm0, %xmm1 | |
186 | pmovmskb %xmm1, %eax | |
187 | testl %eax, %eax | |
188 | jnz L(ret_vec_x1) | |
189 | ||
190 | ||
191 | movaps -(VEC_SIZE * 3)(%rcx), %xmm1 | |
192 | pcmpeqb %xmm0, %xmm1 | |
193 | pmovmskb %xmm1, %eax | |
194 | ||
195 | subq $(VEC_SIZE * 4), %rdx | |
196 | ja L(more_4x_vec) | |
197 | ||
198 | addl $(VEC_SIZE), %edx | |
199 | jle L(ret_vec_x2_test) | |
200 | ||
201 | L(last_vec): | |
202 | testl %eax, %eax | |
203 | jnz L(ret_vec_x2) | |
204 | ||
205 | movaps -(VEC_SIZE * 4)(%rcx), %xmm1 | |
206 | pcmpeqb %xmm0, %xmm1 | |
207 | pmovmskb %xmm1, %eax | |
208 | ||
209 | subl $(VEC_SIZE), %edx | |
210 | bsrl %eax, %eax | |
211 | jz L(ret_3) | |
212 | addl %edx, %eax | |
213 | jl L(zero_2) | |
214 | addq %rdi, %rax | |
215 | L(ret_3): | |
216 | ret | |
217 | ||
218 | .p2align 4,, 6 | |
219 | L(ret_vec_x2_test): | |
220 | bsrl %eax, %eax | |
221 | jz L(zero_2) | |
222 | addl %edx, %eax | |
223 | jl L(zero_2) | |
224 | addq %rdi, %rax | |
225 | ret | |
226 | ||
227 | L(zero_2): | |
228 | xorl %eax, %eax | |
229 | ret | |
230 | ||
231 | ||
232 | .p2align 4,, 5 | |
233 | L(ret_vec_x2): | |
234 | bsrl %eax, %eax | |
235 | leaq -(VEC_SIZE * 3)(%rcx, %rax), %rax | |
236 | ret | |
237 | ||
238 | .p2align 4,, 5 | |
239 | L(ret_vec_x3): | |
240 | bsrl %eax, %eax | |
241 | leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax | |
242 | ret | |
243 | ||
244 | .p2align 4,, 8 | |
245 | L(more_4x_vec): | |
246 | testl %eax, %eax | |
247 | jnz L(ret_vec_x2) | |
248 | ||
249 | movaps -(VEC_SIZE * 4)(%rcx), %xmm1 | |
250 | pcmpeqb %xmm0, %xmm1 | |
251 | pmovmskb %xmm1, %eax | |
252 | ||
253 | testl %eax, %eax | |
254 | jnz L(ret_vec_x3) | |
255 | ||
256 | addq $-(VEC_SIZE * 4), %rcx | |
257 | cmpq $(VEC_SIZE * 4), %rdx | |
258 | jbe L(last_4x_vec) | |
259 | ||
260 | /* Offset everything by 4x VEC_SIZE here to save a few bytes at the end | |
261 | keeping the code from spilling to the next cache line. */ | |
262 | addq $(VEC_SIZE * 4 - 1), %rcx | |
263 | andq $-(VEC_SIZE * 4), %rcx | |
264 | leaq (VEC_SIZE * 4)(%rdi), %rdx | |
265 | andq $-(VEC_SIZE * 4), %rdx | |
266 | ||
267 | .p2align 4,, 11 | |
268 | L(loop_4x_vec): | |
269 | movaps (VEC_SIZE * -1)(%rcx), %xmm1 | |
270 | movaps (VEC_SIZE * -2)(%rcx), %xmm2 | |
271 | movaps (VEC_SIZE * -3)(%rcx), %xmm3 | |
272 | movaps (VEC_SIZE * -4)(%rcx), %xmm4 | |
273 | pcmpeqb %xmm0, %xmm1 | |
274 | pcmpeqb %xmm0, %xmm2 | |
275 | pcmpeqb %xmm0, %xmm3 | |
276 | pcmpeqb %xmm0, %xmm4 | |
277 | ||
278 | por %xmm1, %xmm2 | |
279 | por %xmm3, %xmm4 | |
280 | por %xmm2, %xmm4 | |
281 | ||
282 | pmovmskb %xmm4, %esi | |
283 | testl %esi, %esi | |
284 | jnz L(loop_end) | |
285 | ||
286 | addq $-(VEC_SIZE * 4), %rcx | |
287 | cmpq %rdx, %rcx | |
288 | jne L(loop_4x_vec) | |
289 | ||
290 | subl %edi, %edx | |
291 | ||
292 | /* Ends up being 1-byte nop. */ | |
293 | .p2align 4,, 2 | |
294 | L(last_4x_vec): | |
295 | movaps -(VEC_SIZE)(%rcx), %xmm1 | |
296 | pcmpeqb %xmm0, %xmm1 | |
297 | pmovmskb %xmm1, %eax | |
298 | ||
299 | cmpl $(VEC_SIZE * 2), %edx | |
300 | jbe L(last_2x_vec) | |
301 | ||
302 | testl %eax, %eax | |
303 | jnz L(ret_vec_x0) | |
304 | ||
305 | ||
306 | movaps -(VEC_SIZE * 2)(%rcx), %xmm1 | |
307 | pcmpeqb %xmm0, %xmm1 | |
308 | pmovmskb %xmm1, %eax | |
309 | ||
310 | testl %eax, %eax | |
311 | jnz L(ret_vec_end) | |
312 | ||
313 | movaps -(VEC_SIZE * 3)(%rcx), %xmm1 | |
314 | pcmpeqb %xmm0, %xmm1 | |
315 | pmovmskb %xmm1, %eax | |
316 | ||
317 | subl $(VEC_SIZE * 3), %edx | |
318 | ja L(last_vec) | |
319 | bsrl %eax, %eax | |
320 | jz L(ret_4) | |
321 | addl %edx, %eax | |
322 | jl L(zero_3) | |
323 | addq %rdi, %rax | |
324 | L(ret_4): | |
325 | ret | |
326 | ||
327 | /* Ends up being 1-byte nop. */ | |
328 | .p2align 4,, 3 | |
329 | L(loop_end): | |
330 | pmovmskb %xmm1, %eax | |
331 | sall $16, %eax | |
332 | jnz L(ret_vec_end) | |
333 | ||
334 | pmovmskb %xmm2, %eax | |
335 | testl %eax, %eax | |
336 | jnz L(ret_vec_end) | |
337 | ||
338 | pmovmskb %xmm3, %eax | |
339 | /* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3) | |
340 | then it won't affect the result in esi (VEC4). If ecx is non-zero | |
341 | then CHAR in VEC3 and bsrq will use that position. */ | |
342 | sall $16, %eax | |
343 | orl %esi, %eax | |
344 | bsrl %eax, %eax | |
345 | leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax | |
346 | ret | |
5ac7aa1d | 347 | |
08af081f NG |
348 | L(ret_vec_end): |
349 | bsrl %eax, %eax | |
350 | leaq (VEC_SIZE * -2)(%rax, %rcx), %rax | |
351 | ret | |
352 | /* Use in L(last_4x_vec). In the same cache line. This is just a spare | |
353 | aligning bytes. */ | |
354 | L(zero_3): | |
355 | xorl %eax, %eax | |
356 | ret | |
357 | /* 2-bytes from next cache line. */ | |
358 | END(MEMRCHR) | |
ceabdcd1 | 359 | #endif |