[thirdparty/glibc.git] / sysdeps / powerpc / powerpc64 / power8 / memchr.S

/* Optimized memchr implementation for POWER8.
   Copyright (C) 2017-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* void *[r3] memchr (const void *s [r3], int c [r4], size_t n [r5])  */

#ifndef MEMCHR
# define MEMCHR __memchr
#endif
	.machine  power8
ENTRY_TOCLESS (MEMCHR)
	CALL_MCOUNT 3
	dcbt	0, r3
	clrrdi  r8, r3, 3
	insrdi	r4, r4, 8, 48

	/* Calculate the last acceptable address and check for possible
	   addition overflow by using satured math:
	   r7 = r3 + r5
	   r7 |= -(r7 < x)  */
	add     r7, r3, r5
	subfc   r6, r3, r7
	subfe   r9, r9, r9
	extsw   r6, r9
	or      r7, r7, r6

	insrdi	r4, r4, 16, 32
	cmpldi	r5, 32
	li	r9, -1
	rlwinm	r6, r3, 3, 26, 28 /* Calculate padding.  */
	insrdi  r4, r4, 32, 0
	mr	r10, r7
	addi	r7, r7, -1
#ifdef __LITTLE_ENDIAN__
	sld	r9, r9, r6
#else
	srd	r9, r9, r6
#endif
	ble	L(small_range)
	andi.	r11, r3, 63
	beq	cr0, L(align_qw)
	clrldi	r11, r3, 61
	ld	r12, 0(r8)     /* Load doubleword from memory.  */
	cmpb	r3, r12, r4     /* Check for BYTEs in DWORD1.  */
	and	r3, r3, r9
	clrldi	r6, r7, 61      /* Byte count - 1 in last dword.  */
	clrrdi	r7, r7, 3       /* Address of last doubleword.  */
	cmpldi	cr7, r3, 0      /* Does r3 indicate we got a hit?  */
	bne	cr7, L(done)
	addi	r8, r8, 8
	addi	r5, r5, -8
	add	r5, r5, r11

	/* Are we now aligned to a quadword boundary?  */
	andi.	r11, r8, 15
	beq	cr0, L(align_qw)

	/* Handle DWORD to make it QW aligned.  */
	ld	r12, 0(r8)
	cmpb	r3, r12, r4
	cmpldi	cr7, r3, 0
	bne	cr7, L(done)
	addi	r5, r5, -8
	addi	r8, r8, 8
	/* At this point, r8 is 16B aligned.  */
L(align_qw):
	vspltisb	v0, 0
	/* Precompute vbpermq constant.  */
	vspltisb	v10, 3
	li	r0, 0
	lvsl	v11, r0, r0
	vslb	v10, v11, v10
	mtvrd	v1, r4
	vspltb	v1, v1, 7
	cmpldi	r5, 64
	ble	L(tail64)
	/* Are we 64-byte aligned? If so, jump to the vectorized loop.
	   Note: aligning to 64-byte will necessarily slow down performance for
	   strings around 64 bytes in length due to the extra comparisons
	   required to check alignment for the vectorized loop.  This is a
	   necessary tradeoff we are willing to take in order to speed up the
	   calculation for larger strings.  */
	andi.	r11, r8, 63
	beq	cr0, L(preloop_64B)
	/* In order to begin the 64B loop, it needs to be 64
	   bytes aligned.  So read until it is 64B aligned.  */
	lvx	v4, 0, r8
	vcmpequb	v6, v1, v4
	vcmpequb.	v11, v0, v6
	bnl	cr6, L(found_16B)
	addi	r8, r8, 16
	addi	r5, r5, -16

	andi.	r11, r8, 63
	beq	cr0, L(preloop_64B)
	lvx	v4, 0, r8
	vcmpequb	v6, v1, v4
	vcmpequb.	v11, v0, v6
	bnl	cr6, L(found_16B)
	addi	r8, r8, 16
	addi	r5, r5, -16

	andi.	r11, r8, 63
	beq	cr0, L(preloop_64B)
	lvx	v4, 0, r8
	vcmpequb	v6, v1, v4
	vcmpequb.	v11, v0, v6
	bnl	cr6, L(found_16B)
	addi	r8, r8, 16
	addi	r5, r5, -16
	/* At this point it should be 64B aligned.
	   Prepare for the 64B loop.  */
L(preloop_64B):
	cmpldi	r5, 64		/* Check if r5 < 64.  */
	ble	L(tail64)
	sub	r6, r10, r8
	srdi	r9, r6, 6	/* Number of loop iterations.  */
	mtctr	r9		/* Setup the counter.  */
	li	r11, 16		/* Load required offsets.  */
	li	r9, 32
	li	r7, 48

	/* Handle r5 > 64.  Loop over the bytes in strides of 64B.  */
	.align 4
L(loop):
	lvx	v2, 0, r8	/* Load 4 quadwords.  */
	lvx	v3, r8, r11
	lvx	v4, v8, r9
	lvx	v5, v8, r7
	vcmpequb	v6, v1, v2
	vcmpequb	v7, v1, v3
	vcmpequb	v8, v1, v4
	vcmpequb	v9, v1, v5
	vor	v11, v6, v7
	vor	v12, v8, v9
	vor	v11, v11, v12	/* Compare and merge into one VR for speed.  */
	vcmpequb.	v11, v0, v11
	bnl	cr6, L(found)
	addi	r8, r8, 64	/* Adjust address for the next iteration.  */
	bdnz	L(loop)
	clrldi	r5, r6, 58

	/* Handle remainder of 64B loop or r5 > 64.  */
	.align	4
L(tail64):
	cmpldi	r5, 0
	beq	L(null)
	lvx	v4, 0, r8
	vcmpequb	v6, v1, v4
	vcmpequb.	v11, v0, v6
	bnl	cr6, L(found_16B)
	addi	r8, r8, 16
	cmpldi	cr6, r5, 16
	ble	cr6, L(null)
	addi	r5, r5, -16

	lvx	v4, 0, r8
	vcmpequb	v6, v1, v4
	vcmpequb.	v11, v0, v6
	bnl	cr6, L(found_16B)
	addi	r8, r8, 16
	cmpldi	cr6, r5, 16
	ble	cr6, L(null)
	addi	r5, r5, -16

	lvx	v4, 0, r8
	vcmpequb	v6, v1, v4
	vcmpequb.	v11, v0, v6
	bnl	cr6, L(found_16B)
	addi	r8, r8, 16
	cmpldi	cr6, r5, 16
	ble	cr6, L(null)
	addi	r5, r5, -16

	lvx	v4, 0, r8
	vcmpequb	v6, v1, v4
	vcmpequb.	v11, v0, v6
	bnl	cr6, L(found_16B)
	li	r3, 0
	blr

	/* Found a match in 64B loop.  */
	.align	4
L(found):
	/* Permute the first bit of each byte into bits 48-63.  */
	vbpermq	v6, v6, v10
	vbpermq	v7, v7, v10
	vbpermq	v8, v8, v10
	vbpermq	v9, v9, v10
	/* Shift each component into its correct position for merging.  */
#ifdef __LITTLE_ENDIAN__
	vsldoi	v7, v7, v7, 2
	vsldoi	v8, v8, v8, 4
	vsldoi	v9, v9, v9, 6
#else
	vsldoi	v6, v6, v6, 6
	vsldoi	v7, v7, v7, 4
	vsldoi	v8, v8, v8, 2
#endif
	/* Merge the results and move to a GPR.  */
	vor	v11, v6, v7
	vor	v4, v9, v8
	vor	v4, v11, v4
	mfvrd	r5, v4
#ifdef __LITTLE_ENDIAN__
	addi	r6, r5, -1
	andc	r6, r6, r5
	popcntd	r6, r6
#else
	cntlzd	r6, r5	/* Count leading zeros before the match.  */
#endif
	add	r3, r8, r6	/* Compute final length.  */
	blr

	/* Found a match in last 16 bytes.  */
	.align	4
L(found_16B):
	/* Permute the first bit of each byte into bits 48-63.  */
	vbpermq	v6, v6, v10
	/* Shift each component into its correct position for merging.  */
#ifdef __LITTLE_ENDIAN__
	mfvrd	r7, v6
	addi	r6, r7, -1
	andc	r6, r6, r7
	popcntd	r6, r6
#else
	vsldoi	v6, v6, v6, 6
	mfvrd	r7, v6
	cntlzd	r6, r7	/* Count leading zeros before the match.  */
#endif
	add	r3, r8, r6	/* Compute final length.  */
	cmpld	r6, r5
	bltlr
	li	r3, 0
	blr

	.align	4
	/* r3 has the output of the cmpb instruction, that is, it contains
	   0xff in the same position as BYTE in the original
	   doubleword from the string.  Use that to calculate the pointer.
	   We need to make sure BYTE is *before* the end of the range.  */
L(done):
#ifdef __LITTLE_ENDIAN__
	addi	r0, r3, -1
	andc	r0, r0, r3
	popcntd	r0, r0	      /* Count trailing zeros.  */
#else
	cntlzd	r0, r3	      /* Count leading zeros before the match.  */
#endif
	cmpld	r8, r7         /* Are we on the last dword?  */
	srdi	r0, r0, 3	/* Convert leading/trailing zeros to bytes.  */
	add	r3, r8, r0
	cmpld	cr7, r0, r6     /* If on the last dword, check byte offset.  */
	bnelr
	blelr	cr7
	li	r3, 0
	blr

	.align	4
L(null):
	li	r3, 0
	blr

/* Deals with size <= 32.  */
	.align	4
L(small_range):
	cmpldi	r5, 0
	beq	L(null)
	ld	r12, 0(r8)     /* Load word from memory.  */
	cmpb	r3, r12, r4     /* Check for BYTE in DWORD1.  */
	and	r3, r3, r9
	cmpldi	cr7, r3, 0
	clrldi	r6, r7, 61      /* Byte count - 1 in last dword.  */
	clrrdi	r7, r7, 3       /* Address of last doubleword.  */
	cmpld	r8, r7         /* Are we done already?  */
	bne	cr7, L(done)
	beqlr

	ldu	r12, 8(r8)
	cmpb	r3, r12, r4
	cmpldi	cr6, r3, 0
	cmpld	r8, r7
	bne	cr6, L(done)   /* Found something.  */
	beqlr		      /* Hit end of string (length).  */

	ldu	r12, 8(r8)
	cmpb	r3, r12, r4
	cmpldi	cr6, r3, 0
	cmpld	r8, r7
	bne	cr6, L(done)
	beqlr

	ldu	r12, 8(r8)
	cmpb	r3, r12, r4
	cmpldi	cr6, r3, 0
	cmpld	r8, r7
	bne	cr6, L(done)
	beqlr

	ldu	r12, 8(r8)
	cmpb	r3, r12, r4
	cmpldi	cr6, r3, 0
	bne	cr6, L(done)
	blr

END (MEMCHR)
weak_alias (__memchr, memchr)
libc_hidden_builtin_def (memchr)
Commit	Line	Data
43e0ac24	1	/* Optimized memchr implementation for POWER8.
2b778ceb	2	Copyright (C) 2017-2021 Free Software Foundation, Inc.
43e0ac24 RS	3	This file is part of the GNU C Library.
	4
	5	The GNU C Library is free software; you can redistribute it and/or
	6	modify it under the terms of the GNU Lesser General Public
	7	License as published by the Free Software Foundation; either
	8	version 2.1 of the License, or (at your option) any later version.
	9
	10	The GNU C Library is distributed in the hope that it will be useful,
	11	but WITHOUT ANY WARRANTY; without even the implied warranty of
	12	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	13	Lesser General Public License for more details.
	14
	15	You should have received a copy of the GNU Lesser General Public
	16	License along with the GNU C Library; if not, see
5a82c748	17	<https://www.gnu.org/licenses/>. */
43e0ac24 RS	18
	19	#include <sysdep.h>
	20
	21	/* void [r3] memchr (const void s [r3], int c [r4], size_t n [r5]) */
	22
43e0ac24 RS	23	#ifndef MEMCHR
	24	# define MEMCHR __memchr
	25	#endif
066020c5	26	.machine power8
43e0ac24 RS	27	ENTRY_TOCLESS (MEMCHR)
	28	CALL_MCOUNT 3
	29	dcbt 0, r3
	30	clrrdi r8, r3, 3
	31	insrdi r4, r4, 8, 48
	32
	33	/* Calculate the last acceptable address and check for possible
	34	addition overflow by using satured math:
	35	r7 = r3 + r5
	36	r7 \|= -(r7 < x) */
	37	add r7, r3, r5
	38	subfc r6, r3, r7
	39	subfe r9, r9, r9
	40	extsw r6, r9
	41	or r7, r7, r6
	42
	43	insrdi r4, r4, 16, 32
	44	cmpldi r5, 32
	45	li r9, -1
	46	rlwinm r6, r3, 3, 26, 28 /* Calculate padding. */
	47	insrdi r4, r4, 32, 0
	48	mr r10, r7
	49	addi r7, r7, -1
	50	#ifdef __LITTLE_ENDIAN__
	51	sld r9, r9, r6
	52	#else
	53	srd r9, r9, r6
	54	#endif
	55	ble L(small_range)
	56	andi. r11, r3, 63
	57	beq cr0, L(align_qw)
	58	clrldi r11, r3, 61
	59	ld r12, 0(r8) /* Load doubleword from memory. */
	60	cmpb r3, r12, r4 /* Check for BYTEs in DWORD1. */
	61	and r3, r3, r9
	62	clrldi r6, r7, 61 /* Byte count - 1 in last dword. */
	63	clrrdi r7, r7, 3 /* Address of last doubleword. */
	64	cmpldi cr7, r3, 0 /* Does r3 indicate we got a hit? */
	65	bne cr7, L(done)
	66	addi r8, r8, 8
	67	addi r5, r5, -8
	68	add r5, r5, r11
	69
	70	/* Are we now aligned to a quadword boundary? */
	71	andi. r11, r8, 15
	72	beq cr0, L(align_qw)
	73
	74	/* Handle DWORD to make it QW aligned. */
	75	ld r12, 0(r8)
	76	cmpb r3, r12, r4
	77	cmpldi cr7, r3, 0
	78	bne cr7, L(done)
	79	addi r5, r5, -8
	80	addi r8, r8, 8
	81	/* At this point, r8 is 16B aligned. */
	82	L(align_qw):
	83	vspltisb v0, 0
	84	/* Precompute vbpermq constant. */
	85	vspltisb v10, 3
	86	li r0, 0
	87	lvsl v11, r0, r0
	88	vslb v10, v11, v10
066020c5	89	mtvrd v1, r4
43e0ac24 RS	90	vspltb v1, v1, 7
	91	cmpldi r5, 64
	92	ble L(tail64)
	93	/* Are we 64-byte aligned? If so, jump to the vectorized loop.
	94	Note: aligning to 64-byte will necessarily slow down performance for
	95	strings around 64 bytes in length due to the extra comparisons
	96	required to check alignment for the vectorized loop. This is a
	97	necessary tradeoff we are willing to take in order to speed up the
	98	calculation for larger strings. */
	99	andi. r11, r8, 63
	100	beq cr0, L(preloop_64B)
	101	/* In order to begin the 64B loop, it needs to be 64
	102	bytes aligned. So read until it is 64B aligned. */
	103	lvx v4, 0, r8
	104	vcmpequb v6, v1, v4
	105	vcmpequb. v11, v0, v6
	106	bnl cr6, L(found_16B)
	107	addi r8, r8, 16
	108	addi r5, r5, -16
	109
	110	andi. r11, r8, 63
	111	beq cr0, L(preloop_64B)
	112	lvx v4, 0, r8
	113	vcmpequb v6, v1, v4
	114	vcmpequb. v11, v0, v6
	115	bnl cr6, L(found_16B)
	116	addi r8, r8, 16
	117	addi r5, r5, -16
	118
	119	andi. r11, r8, 63
	120	beq cr0, L(preloop_64B)
	121	lvx v4, 0, r8
	122	vcmpequb v6, v1, v4
	123	vcmpequb. v11, v0, v6
	124	bnl cr6, L(found_16B)
	125	addi r8, r8, 16
	126	addi r5, r5, -16
	127	/* At this point it should be 64B aligned.
	128	Prepare for the 64B loop. */
	129	L(preloop_64B):
	130	cmpldi r5, 64 /* Check if r5 < 64. */
	131	ble L(tail64)
	132	sub r6, r10, r8
	133	srdi r9, r6, 6 /* Number of loop iterations. */
	134	mtctr r9 /* Setup the counter. */
	135	li r11, 16 /* Load required offsets. */
	136	li r9, 32
	137	li r7, 48
	138
	139	/* Handle r5 > 64. Loop over the bytes in strides of 64B. */
	140	.align 4
	141	L(loop):
	142	lvx v2, 0, r8 /* Load 4 quadwords. */
	143	lvx v3, r8, r11
	144	lvx v4, v8, r9
	145	lvx v5, v8, r7
	146	vcmpequb v6, v1, v2
	147	vcmpequb v7, v1, v3
	148	vcmpequb v8, v1, v4
	149	vcmpequb v9, v1, v5
	150	vor v11, v6, v7
	151	vor v12, v8, v9
	152	vor v11, v11, v12 /* Compare and merge into one VR for speed. */
	153	vcmpequb. v11, v0, v11
154	bnl cr6, L(found)
155	addi r8, r8, 64 /* Adjust address for the next iteration. */
156	bdnz L(loop)
157	clrldi r5, r6, 58
158
159	/* Handle remainder of 64B loop or r5 > 64. */
160	.align 4
161	L(tail64):
162	cmpldi r5, 0
163	beq L(null)
164	lvx v4, 0, r8
165	vcmpequb v6, v1, v4
166	vcmpequb. v11, v0, v6
167	bnl cr6, L(found_16B)
168	addi r8, r8, 16
169	cmpldi cr6, r5, 16
170	ble cr6, L(null)
171	addi r5, r5, -16
172
173	lvx v4, 0, r8
174	vcmpequb v6, v1, v4
175	vcmpequb. v11, v0, v6
176	bnl cr6, L(found_16B)
177	addi r8, r8, 16
178	cmpldi cr6, r5, 16
179	ble cr6, L(null)
180	addi r5, r5, -16
181
182	lvx v4, 0, r8
183	vcmpequb v6, v1, v4
184	vcmpequb. v11, v0, v6
185	bnl cr6, L(found_16B)
186	addi r8, r8, 16
187	cmpldi cr6, r5, 16
188	ble cr6, L(null)
189	addi r5, r5, -16
190
191	lvx v4, 0, r8
192	vcmpequb v6, v1, v4
193	vcmpequb. v11, v0, v6
194	bnl cr6, L(found_16B)
195	li r3, 0
196	blr
197
198	/* Found a match in 64B loop. */
199	.align 4
200	L(found):
201	/* Permute the first bit of each byte into bits 48-63. */
066020c5 RFF	202	vbpermq v6, v6, v10
	203	vbpermq v7, v7, v10
	204	vbpermq v8, v8, v10
	205	vbpermq v9, v9, v10
43e0ac24 RS	206	/* Shift each component into its correct position for merging. */
	207	#ifdef __LITTLE_ENDIAN__
	208	vsldoi v7, v7, v7, 2
	209	vsldoi v8, v8, v8, 4
	210	vsldoi v9, v9, v9, 6
	211	#else
	212	vsldoi v6, v6, v6, 6
	213	vsldoi v7, v7, v7, 4
	214	vsldoi v8, v8, v8, 2
	215	#endif
	216	/* Merge the results and move to a GPR. */
	217	vor v11, v6, v7
	218	vor v4, v9, v8
	219	vor v4, v11, v4
066020c5	220	mfvrd r5, v4
43e0ac24 RS	221	#ifdef __LITTLE_ENDIAN__
	222	addi r6, r5, -1
	223	andc r6, r6, r5
	224	popcntd r6, r6
	225	#else
	226	cntlzd r6, r5 /* Count leading zeros before the match. */
	227	#endif
	228	add r3, r8, r6 /* Compute final length. */
	229	blr
	230
	231	/* Found a match in last 16 bytes. */
	232	.align 4
	233	L(found_16B):
	234	/* Permute the first bit of each byte into bits 48-63. */
066020c5	235	vbpermq v6, v6, v10
43e0ac24 RS	236	/* Shift each component into its correct position for merging. */
43e0ac24 RS	237	#ifdef __LITTLE_ENDIAN__
066020c5	238	mfvrd r7, v6
43e0ac24 RS	239	addi r6, r7, -1
	240	andc r6, r6, r7
	241	popcntd r6, r6
	242	#else
	243	vsldoi v6, v6, v6, 6
066020c5	244	mfvrd r7, v6
43e0ac24 RS	245	cntlzd r6, r7 /* Count leading zeros before the match. */
	246	#endif
	247	add r3, r8, r6 /* Compute final length. */
	248	cmpld r6, r5
	249	bltlr
	250	li r3, 0
	251	blr
	252
	253	.align 4
	254	/* r3 has the output of the cmpb instruction, that is, it contains
	255	0xff in the same position as BYTE in the original
	256	doubleword from the string. Use that to calculate the pointer.
	257	We need to make sure BYTE is before the end of the range. */
	258	L(done):
	259	#ifdef __LITTLE_ENDIAN__
	260	addi r0, r3, -1
	261	andc r0, r0, r3
	262	popcntd r0, r0 /* Count trailing zeros. */
	263	#else
	264	cntlzd r0, r3 /* Count leading zeros before the match. */
	265	#endif
	266	cmpld r8, r7 /* Are we on the last dword? */
	267	srdi r0, r0, 3 /* Convert leading/trailing zeros to bytes. */
	268	add r3, r8, r0
	269	cmpld cr7, r0, r6 /* If on the last dword, check byte offset. */
	270	bnelr
	271	blelr cr7
	272	li r3, 0
	273	blr
	274
	275	.align 4
	276	L(null):
	277	li r3, 0
	278	blr
	279
	280	/* Deals with size <= 32. */
	281	.align 4
	282	L(small_range):
	283	cmpldi r5, 0
	284	beq L(null)
	285	ld r12, 0(r8) /* Load word from memory. */
	286	cmpb r3, r12, r4 /* Check for BYTE in DWORD1. */
	287	and r3, r3, r9
	288	cmpldi cr7, r3, 0
	289	clrldi r6, r7, 61 /* Byte count - 1 in last dword. */
	290	clrrdi r7, r7, 3 /* Address of last doubleword. */
	291	cmpld r8, r7 /* Are we done already? */
	292	bne cr7, L(done)
	293	beqlr
	294
	295	ldu r12, 8(r8)
	296	cmpb r3, r12, r4
	297	cmpldi cr6, r3, 0
	298	cmpld r8, r7
	299	bne cr6, L(done) /* Found something. */
	300	beqlr /* Hit end of string (length). */
	301
	302	ldu r12, 8(r8)
	303	cmpb r3, r12, r4
	304	cmpldi cr6, r3, 0
	305	cmpld r8, r7
	306	bne cr6, L(done)
	307	beqlr
	308
309	ldu r12, 8(r8)
310	cmpb r3, r12, r4
311	cmpldi cr6, r3, 0
312	cmpld r8, r7
313	bne cr6, L(done)
314	beqlr
315
316	ldu r12, 8(r8)
317	cmpb r3, r12, r4
318	cmpldi cr6, r3, 0
319	bne cr6, L(done)
320	blr
321
322	END (MEMCHR)
323	weak_alias (__memchr, memchr)
324	libc_hidden_builtin_def (memchr)