/* Optimized memchr implementation for POWER8.
   Copyright (C) 2017-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */
#include <sysdep.h>

/* void *[r3] memchr (const void *s [r3], int c [r4], size_t n [r5])  */

/* TODO: change these to the actual instructions when the minimum required
   binutils allows it.  */
#define MTVRD(v,r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16)))
#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
#define VBPERMQ(t,a,b)	.long (0x1000054c \
			| ((t)<<(32-11))  \
			| ((a)<<(32-16))  \
			| ((b)<<(32-21)) )

#ifndef MEMCHR
# define MEMCHR __memchr
#endif
/* TODO: change this to .machine power8 when the minimum required binutils
   allows it.  */
	.machine  power7
ENTRY_TOCLESS (MEMCHR)
	CALL_MCOUNT 3
	dcbt	0,r3
	clrrdi	r8,r3,3	/* r8 = doubleword-aligned base address.  */
	insrdi	r4,r4,8,48	/* Replicate the search byte across r4.  */

	/* Calculate the last acceptable address and check for possible
	   addition overflow by using saturated math:
	   r7 = r3 + r5
	   r7 |= -(r7 < x)  */
	add	r7,r3,r5
	subfc	r6,r3,r7
	subfe	r9,r9,r9
	extsw	r6,r9
	or	r7,r7,r6

	insrdi	r4,r4,16,32
	cmpldi	r5,32
	li	r9,-1
	rlwinm	r6,r3,3,26,28	/* Calculate padding.  */
	insrdi	r4,r4,32,0
	mr	r10,r7
	addi	r7,r7,-1
#ifdef __LITTLE_ENDIAN__
	sld	r9,r9,r6	/* Mask off bytes before the start address.  */
#else
	srd	r9,r9,r6
#endif
	ble	L(small_range)
	andi.	r11,r3,63
	beq	cr0,L(align_qw)
	clrldi	r11,r3,61
	ld	r12,0(r8)	/* Load doubleword from memory.  */
	cmpb	r3,r12,r4	/* Check for BYTEs in DWORD1.  */
	and	r3,r3,r9
	clrldi	r6,r7,61	/* Byte count - 1 in last dword.  */
	clrrdi	r7,r7,3		/* Address of last doubleword.  */
	cmpldi	cr7,r3,0	/* Does r3 indicate we got a hit?  */
	bne	cr7,L(done)
	addi	r8,r8,8
	addi	r5,r5,-8
	add	r5,r5,r11	/* Compensate for the partial first dword.  */

	/* Are we now aligned to a quadword boundary?  */
	andi.	r11,r8,15
	beq	cr0,L(align_qw)

	/* Handle DWORD to make it QW aligned.  */
	ld	r12,0(r8)
	cmpb	r3,r12,r4
	cmpldi	cr7,r3,0
	bne	cr7,L(done)
	addi	r5,r5,-8
	addi	r8,r8,8
	/* At this point, r8 is 16B aligned.  */
L(align_qw):
	vspltisb	v0,0
	/* Precompute vbpermq constant.  */
	vspltisb	v10,3
	li	r0,0
	lvsl	v11,r0,r0
	vslb	v10,v11,v10
	MTVRD(v1,r4)	/* Move the replicated byte to a VR...  */
	vspltb	v1,v1,7	/* ...and splat it across all 16 lanes.  */
	cmpldi	r5,64
	ble	L(tail64)
	/* Are we 64-byte aligned? If so, jump to the vectorized loop.
	   Note: aligning to 64-byte will necessarily slow down performance for
	   strings around 64 bytes in length due to the extra comparisons
	   required to check alignment for the vectorized loop.  This is a
	   necessary tradeoff we are willing to take in order to speed up the
	   calculation for larger strings.  */
	andi.	r11,r8,63
	beq	cr0,L(preloop_64B)
	/* In order to begin the 64B loop, it needs to be 64
	   bytes aligned.  So read until it is 64B aligned.  */
	lvx	v4,0,r8
	vcmpequb	v6,v1,v4
	vcmpequb.	v11,v0,v6
	bnl	cr6,L(found_16B)
	addi	r8,r8,16
	addi	r5,r5,-16

	andi.	r11,r8,63
	beq	cr0,L(preloop_64B)
	lvx	v4,0,r8
	vcmpequb	v6,v1,v4
	vcmpequb.	v11,v0,v6
	bnl	cr6,L(found_16B)
	addi	r8,r8,16
	addi	r5,r5,-16

	andi.	r11,r8,63
	beq	cr0,L(preloop_64B)
	lvx	v4,0,r8
	vcmpequb	v6,v1,v4
	vcmpequb.	v11,v0,v6
	bnl	cr6,L(found_16B)
	addi	r8,r8,16
	addi	r5,r5,-16
	/* At this point it should be 64B aligned.
	   Prepare for the 64B loop.  */
L(preloop_64B):
	cmpldi	r5,64	/* Check if r5 < 64.  */
	ble	L(tail64)
	sub	r6,r10,r8
	srdi	r9,r6,6	/* Number of loop iterations.  */
	mtctr	r9	/* Setup the counter.  */
	li	r11,16	/* Load required offsets.  */
	li	r9,32
	li	r7,48

	/* Handle r5 > 64.  Loop over the bytes in strides of 64B.  */
	.align	4
L(loop):
	lvx	v2,0,r8		/* Load 4 quadwords.  */
	lvx	v3,r8,r11
	lvx	v4,r8,r9	/* lvx base must be a GPR (r8), not a VR.  */
	lvx	v5,r8,r7
	vcmpequb	v6,v1,v2
	vcmpequb	v7,v1,v3
	vcmpequb	v8,v1,v4
	vcmpequb	v9,v1,v5
	vor	v11,v6,v7
	vor	v12,v8,v9
	vor	v11,v11,v12	/* Compare and merge into one VR for speed.  */
	vcmpequb.	v11,v0,v11
	bnl	cr6,L(found)
	addi	r8,r8,64	/* Adjust address for the next iteration.  */
	bdnz	L(loop)
	clrldi	r5,r6,58	/* Bytes remaining after the last full stride.  */

	/* Handle remainder of 64B loop or r5 > 64.  */
	.align	4
L(tail64):
	cmpldi	r5,0
	beq	L(null)
	lvx	v4,0,r8
	vcmpequb	v6,v1,v4
	vcmpequb.	v11,v0,v6
	bnl	cr6,L(found_16B)
	addi	r8,r8,16
	cmpldi	cr6,r5,16
	ble	cr6,L(null)
	addi	r5,r5,-16

	lvx	v4,0,r8
	vcmpequb	v6,v1,v4
	vcmpequb.	v11,v0,v6
	bnl	cr6,L(found_16B)
	addi	r8,r8,16
	cmpldi	cr6,r5,16
	ble	cr6,L(null)
	addi	r5,r5,-16

	lvx	v4,0,r8
	vcmpequb	v6,v1,v4
	vcmpequb.	v11,v0,v6
	bnl	cr6,L(found_16B)
	addi	r8,r8,16
	cmpldi	cr6,r5,16
	ble	cr6,L(null)
	addi	r5,r5,-16

	lvx	v4,0,r8
	vcmpequb	v6,v1,v4
	vcmpequb.	v11,v0,v6
	bnl	cr6,L(found_16B)
	li	r3,0	/* Not found in the whole range.  */
	blr

	/* Found a match in 64B loop.  */
	.align	4
L(found):
	/* Permute the first bit of each byte into bits 48-63.  */
	VBPERMQ(v6,v6,v10)
	VBPERMQ(v7,v7,v10)
	VBPERMQ(v8,v8,v10)
	VBPERMQ(v9,v9,v10)
	/* Shift each component into its correct position for merging.  */
#ifdef __LITTLE_ENDIAN__
	vsldoi	v7,v7,v7,2
	vsldoi	v8,v8,v8,4
	vsldoi	v9,v9,v9,6
#else
	vsldoi	v6,v6,v6,6
	vsldoi	v7,v7,v7,4
	vsldoi	v8,v8,v8,2
#endif
	/* Merge the results and move to a GPR.  */
	vor	v11,v6,v7
	vor	v4,v9,v8
	vor	v4,v11,v4
	MFVRD(r5,v4)
#ifdef __LITTLE_ENDIAN__
	addi	r6,r5,-1
	andc	r6,r6,r5
	popcntd	r6,r6	/* Count trailing zeros.  */
#else
	cntlzd	r6,r5	/* Count leading zeros before the match.  */
#endif
	add	r3,r8,r6	/* Compute final address of the match.  */
	blr

	/* Found a match in last 16 bytes.  */
	.align	4
L(found_16B):
	/* Permute the first bit of each byte into bits 48-63.  */
	VBPERMQ(v6,v6,v10)
	/* Shift each component into its correct position for merging.  */
#ifdef __LITTLE_ENDIAN__
	MFVRD(r7,v6)
	addi	r6,r7,-1
	andc	r6,r6,r7
	popcntd	r6,r6	/* Count trailing zeros.  */
#else
	vsldoi	v6,v6,v6,6
	MFVRD(r7,v6)
	cntlzd	r6,r7	/* Count leading zeros before the match.  */
#endif
	add	r3,r8,r6	/* Compute final address of the match.  */
	cmpld	r6,r5	/* Is the match before the end of the range?  */
	bltlr
	li	r3,0	/* Match fell outside the range.  */
	blr

	.align	4
	/* r3 has the output of the cmpb instruction, that is, it contains
	   0xff in the same position as BYTE in the original
	   doubleword from the string.  Use that to calculate the pointer.
	   We need to make sure BYTE is *before* the end of the range.  */
L(done):
#ifdef __LITTLE_ENDIAN__
	addi	r0,r3,-1
	andc	r0,r0,r3
	popcntd	r0,r0	/* Count trailing zeros.  */
#else
	cntlzd	r0,r3	/* Count leading zeros before the match.  */
#endif
	cmpld	r8,r7	/* Are we on the last dword?  */
	srdi	r0,r0,3	/* Convert leading/trailing zeros to bytes.  */
	add	r3,r8,r0
	cmpld	cr7,r0,r6	/* If on the last dword, check byte offset.  */
	bnelr
	blelr	cr7
	li	r3,0	/* Match was past the end of the range.  */
	blr

	.align	4
L(null):
	li	r3,0
	blr

	/* Deals with size <= 32.  */
	.align	4
L(small_range):
	cmpldi	r5,0
	beq	L(null)
	ld	r12,0(r8)	/* Load word from memory.  */
	cmpb	r3,r12,r4	/* Check for BYTE in DWORD1.  */
	and	r3,r3,r9
	cmpldi	cr7,r3,0
	clrldi	r6,r7,61	/* Byte count - 1 in last dword.  */
	clrrdi	r7,r7,3		/* Address of last doubleword.  */
	cmpld	r8,r7		/* Are we done already?  */
	bne	cr7,L(done)
	beqlr

	ldu	r12,8(r8)
	cmpb	r3,r12,r4
	cmpldi	cr6,r3,0
	cmpld	r8,r7
	bne	cr6,L(done)	/* Found something.  */
	beqlr			/* Hit end of string (length).  */

	ldu	r12,8(r8)
	cmpb	r3,r12,r4
	cmpldi	cr6,r3,0
	cmpld	r8,r7
	bne	cr6,L(done)
	beqlr

	ldu	r12,8(r8)
	cmpb	r3,r12,r4
	cmpldi	cr6,r3,0
	cmpld	r8,r7
	bne	cr6,L(done)
	beqlr

	ldu	r12,8(r8)
	cmpb	r3,r12,r4
	cmpldi	cr6,r3,0
	bne	cr6,L(done)
	blr

END (MEMCHR)
weak_alias (__memchr, memchr)
libc_hidden_builtin_def (memchr)