/* memrchr optimized with AVX2.
   Copyright (C) 2017-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

# define VEC_SIZE	32

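/* void *__memrchr_avx2 (const void *s, int c, size_t n)

   Scan the N bytes at S backward for the last occurrence of the byte
   C and return a pointer to it, or NULL if it does not occur.  In the
   System V x86-64 ABI, S arrives in RDI, C in ESI and N in RDX.  The
   strategy is the one shared by the AVX2 string routines: compare 32
   bytes at a time against a vector holding 32 copies of C, turn each
   compare into a bit mask, and walk the buffer from its end toward
   its start so that the first hit found is the last occurrence.  */
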
	.section .text.avx,"ax",@progbits
ENTRY (__memrchr_avx2)
	/* Broadcast CHAR to YMM0.  */
	vmovd	%esi, %xmm0
	vpbroadcastb %xmm0, %ymm0

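	/* Every compare below is VPCMPEQB against the 32 copies of CHAR
	   in YMM0, followed by VPMOVMSKB, which packs the byte-wise
	   result into a 32-bit mask where bit I is set iff byte I
	   matched.  BSR on such a mask yields the highest set bit,
	   i.e. the last matching byte in that vector.  */
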
	sub	$VEC_SIZE, %RDX_LP
	jbe	L(last_vec_or_less)

	add	%RDX_LP, %RDI_LP

	/* Check the last VEC_SIZE bytes.  */
	vpcmpeqb (%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x0)

	subq	$(VEC_SIZE * 4), %rdi
	movl	%edi, %ecx
	andl	$(VEC_SIZE - 1), %ecx
	jz	L(aligned_more)

	/* Align data for aligned loads in the loop.  */
	addq	$VEC_SIZE, %rdi
	addq	$VEC_SIZE, %rdx
	andq	$-VEC_SIZE, %rdi
	subq	%rcx, %rdx

	.p2align 4
L(aligned_more):
	subq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_4x_vec_or_less)

	/* Check the last 4 * VEC_SIZE.  Only one VEC_SIZE at a time
	   since data is only aligned to VEC_SIZE.  The vectors are
	   checked from the highest address down so that the first hit
	   is the last occurrence.  */
	vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x3)

	vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2
	vpmovmskb %ymm2, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x2)

	vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3
	vpmovmskb %ymm3, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x1)

	vpcmpeqb (%rdi), %ymm0, %ymm4
	vpmovmskb %ymm4, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x0)

	/* Align data to 4 * VEC_SIZE for loop with fewer branches.
	   There are some overlaps with above if data isn't aligned
	   to 4 * VEC_SIZE.  */
	movl	%edi, %ecx
	andl	$(VEC_SIZE * 4 - 1), %ecx
	jz	L(loop_4x_vec)

	addq	$(VEC_SIZE * 4), %rdi
	addq	$(VEC_SIZE * 4), %rdx
	andq	$-(VEC_SIZE * 4), %rdi
	subq	%rcx, %rdx

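	/* Main loop: step backward 4 * VEC_SIZE per iteration over
	   4 * VEC_SIZE-aligned memory.  The four compare results are
	   OR-ed together so a single VPMOVMSKB/TEST decides whether
	   any of the 128 bytes matched; which vector and byte it was
	   is only sorted out after the loop exits.  */
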
	.p2align 4
L(loop_4x_vec):
	/* Compare 4 * VEC at a time forward.  */
	subq	$(VEC_SIZE * 4), %rdi
	subq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_4x_vec_or_less)

	vmovdqa	(%rdi), %ymm1
	vmovdqa	VEC_SIZE(%rdi), %ymm2
	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm3
	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm4

	vpcmpeqb %ymm1, %ymm0, %ymm1
	vpcmpeqb %ymm2, %ymm0, %ymm2
	vpcmpeqb %ymm3, %ymm0, %ymm3
	vpcmpeqb %ymm4, %ymm0, %ymm4

	vpor	%ymm1, %ymm2, %ymm5
	vpor	%ymm3, %ymm4, %ymm6
	vpor	%ymm5, %ymm6, %ymm5

	vpmovmskb %ymm5, %eax
	testl	%eax, %eax
	jz	L(loop_4x_vec)

	/* There is a match: check the vectors from the highest address
	   (YMM4) down so that the last occurrence is found first.  */
	vpmovmskb %ymm4, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x3)

	vpmovmskb %ymm3, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x2)

	vpmovmskb %ymm2, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x1)

	vpmovmskb %ymm1, %eax
	bsrl	%eax, %eax
	addq	%rdi, %rax
	VZEROUPPER
	ret

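	/* Fewer than 4 * VEC_SIZE bytes remain below the current
	   window.  Restore RDX to the remaining byte count and check
	   up to four vectors, verifying that a match in a partially
	   covered vector still lies within the caller's buffer.  */
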
	.p2align 4
L(last_4x_vec_or_less):
	addl	$(VEC_SIZE * 4), %edx
	cmpl	$(VEC_SIZE * 2), %edx
	jbe	L(last_2x_vec)

	vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x3)

	vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2
	vpmovmskb %ymm2, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x2)

	vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3
	vpmovmskb %ymm3, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x1_check)
	cmpl	$(VEC_SIZE * 3), %edx
	jbe	L(zero)

	vpcmpeqb (%rdi), %ymm0, %ymm4
	vpmovmskb %ymm4, %eax
	testl	%eax, %eax
	jz	L(zero)
	/* A match in the lowest vector is valid only if it is not
	   before the start of the buffer.  */
	bsrl	%eax, %eax
	subq	$(VEC_SIZE * 4), %rdx
	addq	%rax, %rdx
	jl	L(zero)
	addq	%rdi, %rax
	VZEROUPPER
	ret

	.p2align 4
L(last_2x_vec):
	vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x3_check)
	cmpl	$VEC_SIZE, %edx
	jbe	L(zero)

	vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jz	L(zero)
	bsrl	%eax, %eax
	subq	$(VEC_SIZE * 2), %rdx
	addq	%rax, %rdx
	jl	L(zero)
	addl	$(VEC_SIZE * 2), %eax
	addq	%rdi, %rax
	VZEROUPPER
	ret

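	/* The L(last_vec_xN) tails turn a match mask for vector N into
	   the return value: BSR finds the highest set bit, i.e. the
	   last matching byte, then N * VEC_SIZE and the window base in
	   RDI are added to form the pointer.  */
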
	.p2align 4
L(last_vec_x0):
	bsrl	%eax, %eax
	addq	%rdi, %rax
	VZEROUPPER
	ret

	.p2align 4
L(last_vec_x1):
	bsrl	%eax, %eax
	addl	$VEC_SIZE, %eax
	addq	%rdi, %rax
	VZEROUPPER
	ret

	.p2align 4
L(last_vec_x2):
	bsrl	%eax, %eax
	addl	$(VEC_SIZE * 2), %eax
	addq	%rdi, %rax
	VZEROUPPER
	ret

	.p2align 4
L(last_vec_x3):
	bsrl	%eax, %eax
	addl	$(VEC_SIZE * 3), %eax
	addq	%rdi, %rax
	VZEROUPPER
	ret

	.p2align 4
L(last_vec_x1_check):
	bsrl	%eax, %eax
	subq	$(VEC_SIZE * 3), %rdx
	addq	%rax, %rdx
	jl	L(zero)
	addl	$VEC_SIZE, %eax
	addq	%rdi, %rax
	VZEROUPPER
	ret

	.p2align 4
L(last_vec_x3_check):
	bsrl	%eax, %eax
	subq	$VEC_SIZE, %rdx
	addq	%rax, %rdx
	jl	L(zero)
	addl	$(VEC_SIZE * 3), %eax
	addq	%rdi, %rax
	VZEROUPPER
	ret

	.p2align 4
L(zero):
	VZEROUPPER
L(null):
	xorl	%eax, %eax
	ret

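	/* The paths below handle an initial length of at most VEC_SIZE
	   bytes.  Loads are done from VEC_SIZE-aligned addresses so
	   that a full-vector load cannot cross a page boundary, and
	   the bytes outside the caller's buffer are masked out of the
	   compare result.  */
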
	.p2align 4
L(last_vec_or_less_aligned):
	movl	%edx, %ecx

	vpcmpeqb (%rdi), %ymm0, %ymm1

	/* Build the mask (1 << length) - 1.  The count in CL can be
	   VEC_SIZE (32), so shift in 64 bits: a 32-bit shift would
	   truncate the count to 0.  */
	movl	$1, %edx
	salq	%cl, %rdx
	subq	$1, %rdx

	vpmovmskb %ymm1, %eax

	/* Remove the trailing bytes.  */
	andl	%edx, %eax
	testl	%eax, %eax
	jz	L(zero)

	bsrl	%eax, %eax
	addq	%rdi, %rax
	VZEROUPPER
	ret

	.p2align 4
L(last_vec_or_less):
	addl	$VEC_SIZE, %edx

	/* Check for zero length.  */
	testl	%edx, %edx
	jz	L(null)

	movl	%edi, %ecx
	andl	$(VEC_SIZE - 1), %ecx
	jz	L(last_vec_or_less_aligned)

	movl	%ecx, %esi
	/* Save the misalignment so the return value can be rebased
	   onto the original pointer.  */
	movl	%ecx, %r8d
	addl	%edx, %esi
	andq	$-VEC_SIZE, %rdi

	/* If misalignment + length spills into a second vector, two
	   aligned vectors have to be checked.  */
	subl	$VEC_SIZE, %esi
	ja	L(last_vec_2x_aligned)

	/* Check the last VEC.  */
	vpcmpeqb (%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax

	/* Remove the leading and trailing bytes.  */
	sarl	%cl, %eax
	movl	%edx, %ecx

	movl	$1, %edx
	sall	%cl, %edx
	subl	$1, %edx

	andl	%edx, %eax
	testl	%eax, %eax
	jz	L(zero)

	bsrl	%eax, %eax
	addq	%rdi, %rax
	addq	%r8, %rax
	VZEROUPPER
	ret

	.p2align 4
L(last_vec_2x_aligned):
	movl	%esi, %ecx

	/* Check the last VEC.  */
	vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm1

	movl	$1, %edx
	sall	%cl, %edx
	subl	$1, %edx

	vpmovmskb %ymm1, %eax

	/* Remove the trailing bytes.  */
	andl	%edx, %eax

	testl	%eax, %eax
	jnz	L(last_vec_x1)

	/* Check the second last VEC.  */
	vpcmpeqb (%rdi), %ymm0, %ymm1

	movl	%r8d, %ecx

	vpmovmskb %ymm1, %eax

	/* Remove the leading bytes.  Must use unsigned right shift for
	   bsrl below.  */
	shrl	%cl, %eax
	testl	%eax, %eax
	jz	L(zero)

	bsrl	%eax, %eax
	addq	%rdi, %rax
	addq	%r8, %rax
	VZEROUPPER
	ret
END (__memrchr_avx2)
#endif