/* Optimized strlen implementation for PowerPC64/POWER8 using a vectorized
   loop.
   Copyright (C) 2016-2019 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */
#include <sysdep.h>

/* int [r3] strlen (char *s [r3])  */
#ifndef STRLEN
# define STRLEN strlen
#endif
	.machine  power8
ENTRY_TOCLESS (STRLEN, 4)
	CALL_MCOUNT 1
	/* Register roles on entry/exit (grounded in the code below):
	   r3 = s (string pointer) on entry; the length is returned in r3.
	   r4 = doubleword-aligned cursor, r0 = all-zero doubleword for cmpb,
	   r10/r11 = cmpb results used to locate the null byte.  */
	dcbt	0,r3
	clrrdi	r4,r3,3	      /* Align the address to doubleword boundary.  */
	rlwinm	r6,r3,3,26,28 /* Calculate padding (bit offset of s within
				 its doubleword).  */
	li	r0,0	      /* Doubleword with null chars to use
				 with cmpb.  */
	li	r5,-1	      /* MASK = 0xffffffffffffffff.  */
	ld	r12,0(r4)     /* Load doubleword from memory.  */
#ifdef __LITTLE_ENDIAN__
	sld	r5,r5,r6      /* MASK = MASK << padding (LE byte order).  */
#else
	srd	r5,r5,r6      /* MASK = MASK >> padding.  */
#endif
	orc	r9,r12,r5     /* Mask bits that are not part of the string:
				 bytes before s read as 0xff, never null.  */
	cmpb	r10,r9,r0     /* Check for null bytes in DWORD1.  */
	cmpdi	cr7,r10,0     /* If r10 == 0, no nulls have been found.  */
	bne	cr7,L(done)

	/* For shorter strings (< 64 bytes), we will not use vector registers,
	   as the overhead isn't worth it.  So, let's use GPRs instead.  This
	   will be done the same way as we do in the POWER7 implementation.
	   Let's see if we are aligned to a quadword boundary.  If so, we can
	   jump to the first (non-vectorized) loop.  Otherwise, we have to
	   handle the next DWORD first.  */
	mtcrf	0x01,r4	      /* CR7 gets the low nibble of r4; bit 28 tests
				 quadword (16-byte) alignment.  */
	mr	r9,r4	      /* r9 tracks the address of the NEXT dword to
				 be read (r4 + 8 from here on).  */
	addi	r9,r9,8
	bt	28,L(align64)

	/* Handle the next 8 bytes so we are aligned to a quadword
	   boundary.  */
	ldu	r5,8(r4)
	cmpb	r10,r5,r0
	cmpdi	cr7,r10,0
	addi	r9,r9,8
	bne	cr7,L(done)

L(align64):
	/* Proceed to the old (POWER7) implementation, checking two doublewords
	   per iteration.  For the first 56 bytes, we will just check for null
	   characters.  After that, we will also check if we are 64-byte aligned
	   so we can jump to the vectorized implementation.  We will unroll
	   these loops to avoid excessive branching.  */
	ld	r6,8(r4)
	ldu	r5,16(r4)
	cmpb	r10,r6,r0
	cmpb	r11,r5,r0
	or	r5,r10,r11    /* Non-zero iff either dword holds a null.  */
	cmpdi	cr7,r5,0
	addi	r9,r9,16
	bne	cr7,L(dword_zero)

	ld	r6,8(r4)
	ldu	r5,16(r4)
	cmpb	r10,r6,r0
	cmpb	r11,r5,r0
	or	r5,r10,r11
	cmpdi	cr7,r5,0
	addi	r9,r9,16
	bne	cr7,L(dword_zero)

	ld	r6,8(r4)
	ldu	r5,16(r4)
	cmpb	r10,r6,r0
	cmpb	r11,r5,r0
	or	r5,r10,r11
	cmpdi	cr7,r5,0
	addi	r9,r9,16
	bne	cr7,L(dword_zero)

	/* Are we 64-byte aligned? If so, jump to the vectorized loop.
	   Note: aligning to 64-byte will necessarily slow down performance for
	   strings around 64 bytes in length due to the extra comparisons
	   required to check alignment for the vectorized loop.  This is a
	   necessary tradeoff we are willing to take in order to speed up the
	   calculation for larger strings.  */
	andi.	r10,r9,63     /* Alignment test on r9 (next read address).  */
	beq	cr0,L(preloop)
	ld	r6,8(r4)
	ldu	r5,16(r4)
	cmpb	r10,r6,r0
	cmpb	r11,r5,r0
	or	r5,r10,r11
	cmpdi	cr7,r5,0
	addi	r9,r9,16
	bne	cr7,L(dword_zero)

	andi.	r10,r9,63
	beq	cr0,L(preloop)
	ld	r6,8(r4)
	ldu	r5,16(r4)
	cmpb	r10,r6,r0
	cmpb	r11,r5,r0
	or	r5,r10,r11
	cmpdi	cr7,r5,0
	addi	r9,r9,16
	bne	cr7,L(dword_zero)

	andi.	r10,r9,63
	beq	cr0,L(preloop)
	ld	r6,8(r4)
	ldu	r5,16(r4)
	cmpb	r10,r6,r0
	cmpb	r11,r5,r0
	or	r5,r10,r11
	cmpdi	cr7,r5,0
	addi	r9,r9,16

	/* At this point, we are necessarily 64-byte aligned.  If no zeroes were
	   found, jump to the vectorized loop.  */
	beq	cr7,L(preloop)

L(dword_zero):
	/* OK, one (or both) of the doublewords contains a null byte.  Check
	   the first doubleword and decrement the address in case the first
	   doubleword really contains a null byte.  */

	cmpdi	cr6,r10,0
	addi	r4,r4,-8
	bne	cr6,L(done)

	/* The null byte must be in the second doubleword.  Adjust the address
	   again and move the result of cmpb to r10 so we can calculate the
	   length.  */

	mr	r10,r11
	addi	r4,r4,8

	/* If the null byte was found in the non-vectorized code, compute the
	   final length.  r10 has the output of the cmpb instruction, that is,
	   it contains 0xff in the same position as the null byte in the
	   original doubleword from the string.  Use that to calculate the
	   length.  */
L(done):
#ifdef __LITTLE_ENDIAN__
	addi	r9, r10,-1    /* Form a mask from trailing zeros.  */
	andc	r9, r9,r10
	popcntd	r0, r9	      /* Count the bits in the mask.  */
#else
	cntlzd	r0,r10	      /* Count leading zeros before the match.  */
#endif
	subf	r5,r3,r4      /* Bytes from s to the current doubleword.  */
	srdi	r0,r0,3	      /* Convert leading/trailing zeros to bytes.  */
	add	r3,r5,r0      /* Compute final length.  */
	blr

	/* Vectorized implementation starts here.  */
	.p2align  4
L(preloop):
	/* Set up for the loop.  */
	mr	r4,r9	      /* r4 = 64-byte-aligned read address.  */
	li	r7, 16	      /* Load required offsets.  */
	li	r8, 32
	li	r9, 48
	li	r12, 8
	vxor	v0,v0,v0      /* VR with null chars to use with
				 vcmpequb.  */

	/* Main loop to look for the end of the string.  We will read in
	   64-byte chunks.  Align it to 32 bytes and unroll it 3 times to
	   leverage the icache performance.  */
	.p2align  5
L(loop):
	lvx	v1,r4,r0      /* Load 4 quadwords.  */
	lvx	v2,r4,r7
	lvx	v3,r4,r8
	lvx	v4,r4,r9
	vminub	v5,v1,v2      /* Compare and merge into one VR for speed;
				 the min is 0 iff any byte is 0.  */
	vminub	v6,v3,v4
	vminub	v7,v5,v6
	vcmpequb. v7,v7,v0    /* Check for NULLs (CR6 set by the dot form).  */
	addi	r4,r4,64      /* Adjust address for the next iteration.  */
	bne	cr6,L(vmx_zero)

	lvx	v1,r4,r0      /* Load 4 quadwords.  */
	lvx	v2,r4,r7
	lvx	v3,r4,r8
	lvx	v4,r4,r9
	vminub	v5,v1,v2      /* Compare and merge into one VR for speed.  */
	vminub	v6,v3,v4
	vminub	v7,v5,v6
	vcmpequb. v7,v7,v0    /* Check for NULLs.  */
	addi	r4,r4,64      /* Adjust address for the next iteration.  */
	bne	cr6,L(vmx_zero)

	lvx	v1,r4,r0      /* Load 4 quadwords.  */
	lvx	v2,r4,r7
	lvx	v3,r4,r8
	lvx	v4,r4,r9
	vminub	v5,v1,v2      /* Compare and merge into one VR for speed.  */
	vminub	v6,v3,v4
	vminub	v7,v5,v6
	vcmpequb. v7,v7,v0    /* Check for NULLs.  */
	addi	r4,r4,64      /* Adjust address for the next iteration.  */
	beq	cr6,L(loop)

L(vmx_zero):
	/* OK, we found a null byte.  Let's look for it in the current 64-byte
	   block and mark it in its corresponding VR.  */
	vcmpequb v1,v1,v0
	vcmpequb v2,v2,v0
	vcmpequb v3,v3,v0
	vcmpequb v4,v4,v0

	/* We will now 'compress' the result into a single doubleword, so it
	   can be moved to a GPR for the final calculation.  First, we
	   generate an appropriate mask for vbpermq, so we can permute bits into
	   the first halfword.  */
	vspltisb v10,3
	lvsl	v11,r0,r0     /* v11 = {0,1,2,...,15} (identity permute).  */
	vslb	v10,v11,v10   /* v10 = {0,8,16,...,120}: bit indices selecting
				 the top bit of each byte for vbpermq.  */

	/* Permute the first bit of each byte into bits 48-63.  */
	vbpermq	v1,v1,v10
	vbpermq	v2,v2,v10
	vbpermq	v3,v3,v10
	vbpermq	v4,v4,v10

	/* Shift each component into its correct position for merging.  */
#ifdef __LITTLE_ENDIAN__
	vsldoi	v2,v2,v2,2
	vsldoi	v3,v3,v3,4
	vsldoi	v4,v4,v4,6
#else
	vsldoi	v1,v1,v1,6
	vsldoi	v2,v2,v2,4
	vsldoi	v3,v3,v3,2
#endif

	/* Merge the results and move to a GPR.  */
	vor	v1,v2,v1
	vor	v2,v3,v4
	vor	v4,v1,v2
	mfvrd	r10,v4	      /* r10 = one bit per byte of the 64-byte block,
				 set where a null byte was found.  */

	/* Adjust address to the beginning of the current 64-byte block.  */
	addi	r4,r4,-64

#ifdef __LITTLE_ENDIAN__
	addi	r9, r10,-1    /* Form a mask from trailing zeros.  */
	andc	r9, r9,r10
	popcntd	r0, r9	      /* Count the bits in the mask.  */
#else
	cntlzd	r0,r10	      /* Count leading zeros before the match.  */
#endif
	subf	r5,r3,r4
	add	r3,r5,r0      /* Compute final length.  Note: no srdi here —
				 each mask bit already corresponds to one
				 byte of the block.  */
	blr

END (STRLEN)
libc_hidden_builtin_def (strlen)