[thirdparty/glibc.git] / sysdeps / x86_64 / multiarch / memrchr-sse2.S

/* memrchr optimized with SSE2.
   Copyright (C) 2017-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

/* MINIMUM_X86_ISA_LEVEL <= 2 because there is no V2 implementation
   so we need this to build for ISA V2 builds. */
#if ISA_SHOULD_BUILD (2)

# ifndef MEMRCHR
#  define MEMRCHR __memrchr_sse2
# endif

# include <sysdep.h>
# define VEC_SIZE			16
# define PAGE_SIZE			4096

	.text
ENTRY_P2ALIGN(MEMRCHR, 6)
# ifdef __ILP32__
	/* Clear upper bits.  */
	mov	%RDX_LP, %RDX_LP
# endif
	movd	%esi, %xmm0

	/* Get end pointer.  */
	leaq	(%rdx, %rdi), %rcx

	punpcklbw %xmm0, %xmm0
	punpcklwd %xmm0, %xmm0
	pshufd	$0, %xmm0, %xmm0

	/* Check if we can load 1x VEC without cross a page.  */
	testl	$(PAGE_SIZE - VEC_SIZE), %ecx
	jz	L(page_cross)

	/* NB: This load happens regardless of whether rdx (len) is zero. Since
	   it doesn't cross a page and the standard gurantees any pointer have
	   at least one-valid byte this load must be safe. For the entire
	   history of the x86 memrchr implementation this has been possible so
	   no code "should" be relying on a zero-length check before this load.
	   The zero-length check is moved to the page cross case because it is
	   1) pretty cold and including it pushes the hot case len <= VEC_SIZE
	   into 2-cache lines.  */
	movups	-(VEC_SIZE)(%rcx), %xmm1
	pcmpeqb	%xmm0, %xmm1
	pmovmskb %xmm1, %eax

	subq	$VEC_SIZE, %rdx
	ja	L(more_1x_vec)
L(ret_vec_x0_test):
	/* Zero-flag set if eax (src) is zero. Destination unchanged if src is
	   zero.  */
	bsrl	%eax, %eax
	jz	L(ret_0)
	/* Check if the CHAR match is in bounds. Need to truly zero `eax` here
	   if out of bounds.  */
	addl	%edx, %eax
	jl	L(zero_0)
	/* Since we subtracted VEC_SIZE from rdx earlier we can just add to base
	   ptr.  */
	addq	%rdi, %rax
L(ret_0):
	ret

	.p2align 4,, 5
L(ret_vec_x0):
	bsrl	%eax, %eax
	leaq	-(VEC_SIZE)(%rcx, %rax), %rax
	ret

	.p2align 4,, 2
L(zero_0):
	xorl	%eax, %eax
	ret


	.p2align 4,, 8
L(more_1x_vec):
	testl	%eax, %eax
	jnz	L(ret_vec_x0)

	/* Align rcx (pointer to string).  */
	decq	%rcx
	andq	$-VEC_SIZE, %rcx

	movq	%rcx, %rdx
	/* NB: We could consistenyl save 1-byte in this pattern with `movaps
	   %xmm0, %xmm1; pcmpeq IMM8(r), %xmm1; ...`. The reason against it is
	   it adds more frontend uops (even if the moves can be eliminated) and
	   some percentage of the time actual backend uops.  */
	movaps	-(VEC_SIZE)(%rcx), %xmm1
	pcmpeqb	%xmm0, %xmm1
	subq	%rdi, %rdx
	pmovmskb %xmm1, %eax

	cmpq	$(VEC_SIZE * 2), %rdx
	ja	L(more_2x_vec)
L(last_2x_vec):
	subl	$VEC_SIZE, %edx
	jbe	L(ret_vec_x0_test)

	testl	%eax, %eax
	jnz	L(ret_vec_x0)

	movaps	-(VEC_SIZE * 2)(%rcx), %xmm1
	pcmpeqb	%xmm0, %xmm1
	pmovmskb %xmm1, %eax

	subl	$VEC_SIZE, %edx
	bsrl	%eax, %eax
	jz	L(ret_1)
	addl	%edx, %eax
	jl	L(zero_0)
	addq	%rdi, %rax
L(ret_1):
	ret

	/* Don't align. Otherwise lose 2-byte encoding in jump to L(page_cross)
	   causes the hot pause (length <= VEC_SIZE) to span multiple cache
	   lines.  Naturally aligned % 16 to 8-bytes.  */
L(page_cross):
	/* Zero length check.  */
	testq	%rdx, %rdx
	jz	L(zero_0)

	leaq	-1(%rcx), %r8
	andq	$-(VEC_SIZE), %r8

	movaps	(%r8), %xmm1
	pcmpeqb	%xmm0, %xmm1
	pmovmskb %xmm1, %esi
	/* Shift out negative alignment (because we are starting from endptr and
	   working backwards).  */
	negl	%ecx
	/* 32-bit shift but VEC_SIZE=16 so need to mask the shift count
	   explicitly.  */
	andl	$(VEC_SIZE - 1), %ecx
	shl	%cl, %esi
	movzwl	%si, %eax
	leaq	(%rdi, %rdx), %rcx
	cmpq	%rdi, %r8
	ja	L(more_1x_vec)
	subl	$VEC_SIZE, %edx
	bsrl	%eax, %eax
	jz	L(ret_2)
	addl	%edx, %eax
	jl	L(zero_1)
	addq	%rdi, %rax
L(ret_2):
	ret

	/* Fits in aliging bytes.  */
L(zero_1):
	xorl	%eax, %eax
	ret

	.p2align 4,, 5
L(ret_vec_x1):
	bsrl	%eax, %eax
	leaq	-(VEC_SIZE * 2)(%rcx, %rax), %rax
	ret

	.p2align 4,, 8
L(more_2x_vec):
	testl	%eax, %eax
	jnz	L(ret_vec_x0)

	movaps	-(VEC_SIZE * 2)(%rcx), %xmm1
	pcmpeqb	%xmm0, %xmm1
	pmovmskb %xmm1, %eax
	testl	%eax, %eax
	jnz	L(ret_vec_x1)


	movaps	-(VEC_SIZE * 3)(%rcx), %xmm1
	pcmpeqb	%xmm0, %xmm1
	pmovmskb %xmm1, %eax

	subq	$(VEC_SIZE * 4), %rdx
	ja	L(more_4x_vec)

	addl	$(VEC_SIZE), %edx
	jle	L(ret_vec_x2_test)

L(last_vec):
	testl	%eax, %eax
	jnz	L(ret_vec_x2)

	movaps	-(VEC_SIZE * 4)(%rcx), %xmm1
	pcmpeqb	%xmm0, %xmm1
	pmovmskb %xmm1, %eax

	subl	$(VEC_SIZE), %edx
	bsrl	%eax, %eax
	jz	L(ret_3)
	addl	%edx, %eax
	jl	L(zero_2)
	addq	%rdi, %rax
L(ret_3):
	ret

	.p2align 4,, 6
L(ret_vec_x2_test):
	bsrl	%eax, %eax
	jz	L(zero_2)
	addl	%edx, %eax
	jl	L(zero_2)
	addq	%rdi, %rax
	ret

L(zero_2):
	xorl	%eax, %eax
	ret


	.p2align 4,, 5
L(ret_vec_x2):
	bsrl	%eax, %eax
	leaq	-(VEC_SIZE * 3)(%rcx, %rax), %rax
	ret

	.p2align 4,, 5
L(ret_vec_x3):
	bsrl	%eax, %eax
	leaq	-(VEC_SIZE * 4)(%rcx, %rax), %rax
	ret

	.p2align 4,, 8
L(more_4x_vec):
	testl	%eax, %eax
	jnz	L(ret_vec_x2)

	movaps	-(VEC_SIZE * 4)(%rcx), %xmm1
	pcmpeqb	%xmm0, %xmm1
	pmovmskb %xmm1, %eax

	testl	%eax, %eax
	jnz	L(ret_vec_x3)

	addq	$-(VEC_SIZE * 4), %rcx
	cmpq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_4x_vec)

	/* Offset everything by 4x VEC_SIZE here to save a few bytes at the end
	   keeping the code from spilling to the next cache line.  */
	addq	$(VEC_SIZE * 4 - 1), %rcx
	andq	$-(VEC_SIZE * 4), %rcx
	leaq	(VEC_SIZE * 4)(%rdi), %rdx
	andq	$-(VEC_SIZE * 4), %rdx

	.p2align 4,, 11
L(loop_4x_vec):
	movaps	(VEC_SIZE * -1)(%rcx), %xmm1
	movaps	(VEC_SIZE * -2)(%rcx), %xmm2
	movaps	(VEC_SIZE * -3)(%rcx), %xmm3
	movaps	(VEC_SIZE * -4)(%rcx), %xmm4
	pcmpeqb	%xmm0, %xmm1
	pcmpeqb	%xmm0, %xmm2
	pcmpeqb	%xmm0, %xmm3
	pcmpeqb	%xmm0, %xmm4

	por	%xmm1, %xmm2
	por	%xmm3, %xmm4
	por	%xmm2, %xmm4

	pmovmskb %xmm4, %esi
	testl	%esi, %esi
	jnz	L(loop_end)

	addq	$-(VEC_SIZE * 4), %rcx
	cmpq	%rdx, %rcx
	jne	L(loop_4x_vec)

	subl	%edi, %edx

	/* Ends up being 1-byte nop.  */
	.p2align 4,, 2
L(last_4x_vec):
	movaps	-(VEC_SIZE)(%rcx), %xmm1
	pcmpeqb	%xmm0, %xmm1
	pmovmskb %xmm1, %eax

	cmpl	$(VEC_SIZE * 2), %edx
	jbe	L(last_2x_vec)

	testl	%eax, %eax
	jnz	L(ret_vec_x0)


	movaps	-(VEC_SIZE * 2)(%rcx), %xmm1
	pcmpeqb	%xmm0, %xmm1
	pmovmskb %xmm1, %eax

	testl	%eax, %eax
	jnz	L(ret_vec_end)

	movaps	-(VEC_SIZE * 3)(%rcx), %xmm1
	pcmpeqb	%xmm0, %xmm1
	pmovmskb %xmm1, %eax

	subl	$(VEC_SIZE * 3), %edx
	ja	L(last_vec)
	bsrl	%eax, %eax
	jz	L(ret_4)
	addl	%edx, %eax
	jl	L(zero_3)
	addq	%rdi, %rax
L(ret_4):
	ret

	/* Ends up being 1-byte nop.  */
	.p2align 4,, 3
L(loop_end):
	pmovmskb %xmm1, %eax
	sall	$16, %eax
	jnz	L(ret_vec_end)

	pmovmskb %xmm2, %eax
	testl	%eax, %eax
	jnz	L(ret_vec_end)

	pmovmskb %xmm3, %eax
	/* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3)
	   then it won't affect the result in esi (VEC4). If ecx is non-zero
	   then CHAR in VEC3 and bsrq will use that position.  */
	sall	$16, %eax
	orl	%esi, %eax
	bsrl	%eax, %eax
	leaq	-(VEC_SIZE * 4)(%rcx, %rax), %rax
	ret

L(ret_vec_end):
	bsrl	%eax, %eax
	leaq	(VEC_SIZE * -2)(%rax, %rcx), %rax
	ret
	/* Use in L(last_4x_vec). In the same cache line. This is just a spare
	   aligning bytes.  */
L(zero_3):
	xorl	%eax, %eax
	ret
	/* 2-bytes from next cache line.  */
END(MEMRCHR)
#endif
Commit	Line	Data
5ac7aa1d	1	/* memrchr optimized with SSE2.
581c785b	2	Copyright (C) 2017-2022 Free Software Foundation, Inc.
5ac7aa1d L	3	This file is part of the GNU C Library.
	4
	5	The GNU C Library is free software; you can redistribute it and/or
	6	modify it under the terms of the GNU Lesser General Public
	7	License as published by the Free Software Foundation; either
	8	version 2.1 of the License, or (at your option) any later version.
	9
	10	The GNU C Library is distributed in the hope that it will be useful,
	11	but WITHOUT ANY WARRANTY; without even the implied warranty of
	12	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	13	Lesser General Public License for more details.
	14
	15	You should have received a copy of the GNU Lesser General Public
	16	License along with the GNU C Library; if not, see
5a82c748	17	<https://www.gnu.org/licenses/>. */
5ac7aa1d	18
ceabdcd1 NG	19	#include <isa-level.h>
	20
	21	/* MINIMUM_X86_ISA_LEVEL <= 2 because there is no V2 implementation
	22	so we need this to build for ISA V2 builds. */
	23	#if ISA_SHOULD_BUILD (2)
	24
08af081f NG	25	# ifndef MEMRCHR
	26	# define MEMRCHR __memrchr_sse2
	27	# endif
08af081f	28
ceabdcd1 NG	29	# include <sysdep.h>
	30	# define VEC_SIZE 16
	31	# define PAGE_SIZE 4096
5ac7aa1d	32
08af081f NG	33	.text
08af081f NG	34	ENTRY_P2ALIGN(MEMRCHR, 6)
ceabdcd1	35	# ifdef __ILP32__
08af081f NG	36	/* Clear upper bits. */
08af081f NG	37	mov %RDX_LP, %RDX_LP
ceabdcd1	38	# endif
08af081f NG	39	movd %esi, %xmm0
	40
	41	/* Get end pointer. */
	42	leaq (%rdx, %rdi), %rcx
	43
	44	punpcklbw %xmm0, %xmm0
	45	punpcklwd %xmm0, %xmm0
	46	pshufd $0, %xmm0, %xmm0
	47
	48	/* Check if we can load 1x VEC without cross a page. */
	49	testl $(PAGE_SIZE - VEC_SIZE), %ecx
	50	jz L(page_cross)
	51
	52	/* NB: This load happens regardless of whether rdx (len) is zero. Since
	53	it doesn't cross a page and the standard gurantees any pointer have
	54	at least one-valid byte this load must be safe. For the entire
	55	history of the x86 memrchr implementation this has been possible so
	56	no code "should" be relying on a zero-length check before this load.
	57	The zero-length check is moved to the page cross case because it is
	58	1) pretty cold and including it pushes the hot case len <= VEC_SIZE
	59	into 2-cache lines. */
	60	movups -(VEC_SIZE)(%rcx), %xmm1
	61	pcmpeqb %xmm0, %xmm1
	62	pmovmskb %xmm1, %eax
	63
	64	subq $VEC_SIZE, %rdx
	65	ja L(more_1x_vec)
	66	L(ret_vec_x0_test):
	67	/* Zero-flag set if eax (src) is zero. Destination unchanged if src is
	68	zero. */
	69	bsrl %eax, %eax
	70	jz L(ret_0)
	71	/* Check if the CHAR match is in bounds. Need to truly zero `eax` here
	72	if out of bounds. */
	73	addl %edx, %eax
	74	jl L(zero_0)
	75	/* Since we subtracted VEC_SIZE from rdx earlier we can just add to base
	76	ptr. */
	77	addq %rdi, %rax
	78	L(ret_0):
	79	ret
	80
	81	.p2align 4,, 5
	82	L(ret_vec_x0):
	83	bsrl %eax, %eax
	84	leaq -(VEC_SIZE)(%rcx, %rax), %rax
	85	ret
	86
	87	.p2align 4,, 2
	88	L(zero_0):
	89	xorl %eax, %eax
	90	ret
	91
	92
	93	.p2align 4,, 8
	94	L(more_1x_vec):
	95	testl %eax, %eax
	96	jnz L(ret_vec_x0)
	97
	98	/* Align rcx (pointer to string). */
	99	decq %rcx
	100	andq $-VEC_SIZE, %rcx
	101
	102	movq %rcx, %rdx
103	/* NB: We could consistenyl save 1-byte in this pattern with `movaps
104	%xmm0, %xmm1; pcmpeq IMM8(r), %xmm1; ...`. The reason against it is
105	it adds more frontend uops (even if the moves can be eliminated) and
106	some percentage of the time actual backend uops. */
107	movaps -(VEC_SIZE)(%rcx), %xmm1
108	pcmpeqb %xmm0, %xmm1
109	subq %rdi, %rdx
110	pmovmskb %xmm1, %eax
111
112	cmpq $(VEC_SIZE * 2), %rdx
113	ja L(more_2x_vec)
114	L(last_2x_vec):
115	subl $VEC_SIZE, %edx
116	jbe L(ret_vec_x0_test)
117
118	testl %eax, %eax
119	jnz L(ret_vec_x0)
120
121	movaps -(VEC_SIZE * 2)(%rcx), %xmm1
122	pcmpeqb %xmm0, %xmm1
123	pmovmskb %xmm1, %eax
124
125	subl $VEC_SIZE, %edx
126	bsrl %eax, %eax
127	jz L(ret_1)
128	addl %edx, %eax
129	jl L(zero_0)
130	addq %rdi, %rax
131	L(ret_1):
132	ret
133
134	/* Don't align. Otherwise lose 2-byte encoding in jump to L(page_cross)
135	causes the hot pause (length <= VEC_SIZE) to span multiple cache
136	lines. Naturally aligned % 16 to 8-bytes. */
137	L(page_cross):
138	/* Zero length check. */
139	testq %rdx, %rdx
140	jz L(zero_0)
141
142	leaq -1(%rcx), %r8
143	andq $-(VEC_SIZE), %r8
144
145	movaps (%r8), %xmm1
146	pcmpeqb %xmm0, %xmm1
147	pmovmskb %xmm1, %esi
148	/* Shift out negative alignment (because we are starting from endptr and
149	working backwards). */
150	negl %ecx
151	/* 32-bit shift but VEC_SIZE=16 so need to mask the shift count
152	explicitly. */
153	andl $(VEC_SIZE - 1), %ecx
154	shl %cl, %esi
155	movzwl %si, %eax
156	leaq (%rdi, %rdx), %rcx
157	cmpq %rdi, %r8
158	ja L(more_1x_vec)
159	subl $VEC_SIZE, %edx
160	bsrl %eax, %eax
161	jz L(ret_2)
162	addl %edx, %eax
163	jl L(zero_1)
164	addq %rdi, %rax
165	L(ret_2):
166	ret
167
168	/* Fits in aliging bytes. */
169	L(zero_1):
170	xorl %eax, %eax
171	ret
172
173	.p2align 4,, 5
174	L(ret_vec_x1):
175	bsrl %eax, %eax
176	leaq -(VEC_SIZE * 2)(%rcx, %rax), %rax
177	ret
178
179	.p2align 4,, 8
180	L(more_2x_vec):
181	testl %eax, %eax
182	jnz L(ret_vec_x0)
183
184	movaps -(VEC_SIZE * 2)(%rcx), %xmm1
185	pcmpeqb %xmm0, %xmm1
186	pmovmskb %xmm1, %eax
187	testl %eax, %eax
188	jnz L(ret_vec_x1)
189
190
191	movaps -(VEC_SIZE * 3)(%rcx), %xmm1
192	pcmpeqb %xmm0, %xmm1
193	pmovmskb %xmm1, %eax
194
195	subq $(VEC_SIZE * 4), %rdx
196	ja L(more_4x_vec)
197
198	addl $(VEC_SIZE), %edx
199	jle L(ret_vec_x2_test)
200
201	L(last_vec):
202	testl %eax, %eax
203	jnz L(ret_vec_x2)
204
205	movaps -(VEC_SIZE * 4)(%rcx), %xmm1
206	pcmpeqb %xmm0, %xmm1
207	pmovmskb %xmm1, %eax
208
209	subl $(VEC_SIZE), %edx
210	bsrl %eax, %eax
211	jz L(ret_3)
212	addl %edx, %eax
213	jl L(zero_2)
214	addq %rdi, %rax
215	L(ret_3):
216	ret
217
218	.p2align 4,, 6
219	L(ret_vec_x2_test):
220	bsrl %eax, %eax
221	jz L(zero_2)
222	addl %edx, %eax
223	jl L(zero_2)
224	addq %rdi, %rax
225	ret
226
227	L(zero_2):
228	xorl %eax, %eax
229	ret
230
231
232	.p2align 4,, 5
233	L(ret_vec_x2):
234	bsrl %eax, %eax
235	leaq -(VEC_SIZE * 3)(%rcx, %rax), %rax
236	ret
237
238	.p2align 4,, 5
239	L(ret_vec_x3):
240	bsrl %eax, %eax
241	leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax
242	ret
243
244	.p2align 4,, 8
245	L(more_4x_vec):
246	testl %eax, %eax
247	jnz L(ret_vec_x2)
248
249	movaps -(VEC_SIZE * 4)(%rcx), %xmm1
250	pcmpeqb %xmm0, %xmm1
251	pmovmskb %xmm1, %eax
252
253	testl %eax, %eax
254	jnz L(ret_vec_x3)
255
256	addq $-(VEC_SIZE * 4), %rcx
257	cmpq $(VEC_SIZE * 4), %rdx
258	jbe L(last_4x_vec)
259
260	/* Offset everything by 4x VEC_SIZE here to save a few bytes at the end
261	keeping the code from spilling to the next cache line. */
262	addq $(VEC_SIZE * 4 - 1), %rcx
263	andq $-(VEC_SIZE * 4), %rcx
264	leaq (VEC_SIZE * 4)(%rdi), %rdx
265	andq $-(VEC_SIZE * 4), %rdx
266
267	.p2align 4,, 11
268	L(loop_4x_vec):
269	movaps (VEC_SIZE * -1)(%rcx), %xmm1
270	movaps (VEC_SIZE * -2)(%rcx), %xmm2
271	movaps (VEC_SIZE * -3)(%rcx), %xmm3
272	movaps (VEC_SIZE * -4)(%rcx), %xmm4
273	pcmpeqb %xmm0, %xmm1
274	pcmpeqb %xmm0, %xmm2
275	pcmpeqb %xmm0, %xmm3
276	pcmpeqb %xmm0, %xmm4
277
278	por %xmm1, %xmm2
279	por %xmm3, %xmm4
280	por %xmm2, %xmm4
281
282	pmovmskb %xmm4, %esi
283	testl %esi, %esi
284	jnz L(loop_end)
285
286	addq $-(VEC_SIZE * 4), %rcx
287	cmpq %rdx, %rcx
288	jne L(loop_4x_vec)
289
290	subl %edi, %edx
291
292	/* Ends up being 1-byte nop. */
293	.p2align 4,, 2
294	L(last_4x_vec):
295	movaps -(VEC_SIZE)(%rcx), %xmm1
296	pcmpeqb %xmm0, %xmm1
297	pmovmskb %xmm1, %eax
298
299	cmpl $(VEC_SIZE * 2), %edx
300	jbe L(last_2x_vec)
301
302	testl %eax, %eax
303	jnz L(ret_vec_x0)
304
305
306	movaps -(VEC_SIZE * 2)(%rcx), %xmm1
307	pcmpeqb %xmm0, %xmm1
308	pmovmskb %xmm1, %eax
309
310	testl %eax, %eax
311	jnz L(ret_vec_end)
312
313	movaps -(VEC_SIZE * 3)(%rcx), %xmm1
314	pcmpeqb %xmm0, %xmm1
315	pmovmskb %xmm1, %eax
316
317	subl $(VEC_SIZE * 3), %edx
318	ja L(last_vec)
319	bsrl %eax, %eax
320	jz L(ret_4)
321	addl %edx, %eax
322	jl L(zero_3)
323	addq %rdi, %rax
324	L(ret_4):
325	ret
326
327	/* Ends up being 1-byte nop. */
328	.p2align 4,, 3
329	L(loop_end):
330	pmovmskb %xmm1, %eax
331	sall $16, %eax
332	jnz L(ret_vec_end)
333
334	pmovmskb %xmm2, %eax
335	testl %eax, %eax
336	jnz L(ret_vec_end)
337
338	pmovmskb %xmm3, %eax
339	/* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3)
340	then it won't affect the result in esi (VEC4). If ecx is non-zero
341	then CHAR in VEC3 and bsrq will use that position. */
342	sall $16, %eax
343	orl %esi, %eax
344	bsrl %eax, %eax
345	leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax
346	ret
5ac7aa1d	347
08af081f NG	348	L(ret_vec_end):
	349	bsrl %eax, %eax
	350	leaq (VEC_SIZE * -2)(%rax, %rcx), %rax
	351	ret
	352	/* Use in L(last_4x_vec). In the same cache line. This is just a spare
	353	aligning bytes. */
	354	L(zero_3):
	355	xorl %eax, %eax
	356	ret
	357	/* 2-bytes from next cache line. */
	358	END(MEMRCHR)
ceabdcd1	359	#endif