/* Optimized strcmp implementation for PowerPC64/POWER9.
   Copyright (C) 2016-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */
18 | #ifdef __LITTLE_ENDIAN__ | |
19 | #include <sysdep.h> | |
20 | ||
3bc426e1 WSM |
21 | #ifndef STRCMP |
22 | # define STRCMP strcmp | |
23 | #endif | |
24 | ||
80ab6401 RS |
25 | /* Implements the function |
26 | ||
27 | int [r3] strcmp (const char *s1 [r3], const char *s2 [r4]) | |
28 | ||
29 | The implementation uses unaligned doubleword access for first 32 bytes | |
30 | as in POWER8 patch and uses vectorised loops after that. */ | |
31 | ||
32 | /* TODO: Change this to actual instructions when minimum binutils is upgraded | |
33 | to 2.27. Macros are defined below for these newer instructions in order | |
34 | to maintain compatibility. */ | |
35 | # define VCTZLSBB(r,v) .long (0x10010602 | ((r)<<(32-11)) | ((v)<<(32-21))) | |
36 | ||
37 | # define VEXTUBRX(t,a,b) .long (0x1000070d \ | |
38 | | ((t)<<(32-11)) \ | |
39 | | ((a)<<(32-16)) \ | |
40 | | ((b)<<(32-21)) ) | |
41 | ||
42 | # define VCMPNEZB(t,a,b) .long (0x10000507 \ | |
43 | | ((t)<<(32-11)) \ | |
44 | | ((a)<<(32-16)) \ | |
45 | | ((b)<<(32-21)) ) | |
46 | ||
47 | /* Get 16 bytes for unaligned case. | |
48 | reg1: Vector to hold next 16 bytes. | |
49 | reg2: Address to read from. | |
50 | reg3: Permute control vector. */ | |
51 | # define GET16BYTES(reg1, reg2, reg3) \ | |
52 | lvx reg1, 0, reg2; \ | |
53 | vperm v8, v2, reg1, reg3; \ | |
54 | vcmpequb. v8, v0, v8; \ | |
55 | beq cr6, 1f; \ | |
56 | vspltisb v9, 0; \ | |
57 | b 2f; \ | |
58 | .align 4; \ | |
59 | 1: \ | |
60 | addi r6, reg2, 16; \ | |
61 | lvx v9, 0, r6; \ | |
62 | 2: \ | |
63 | vperm reg1, v9, reg1, reg3; | |
64 | ||
65 | /* TODO: change this to .machine power9 when the minimum required binutils | |
66 | allows it. */ | |
67 | ||
68 | .machine power7 | |
d5b41185 | 69 | ENTRY_TOCLESS (STRCMP, 4) |
80ab6401 RS |
70 | li r0, 0 |
71 | ||
04f0fd64 | 72 | /* Check if [s1]+16 or [s2]+16 will cross a 4K page boundary using |
80ab6401 RS |
73 | the code: |
74 | ||
75 | (((size_t) s1) % PAGE_SIZE > (PAGE_SIZE - ITER_SIZE)) | |
76 | ||
04f0fd64 | 77 | with PAGE_SIZE being 4096 and ITER_SIZE begin 16. */ |
80ab6401 RS |
78 | |
79 | rldicl r7, r3, 0, 52 | |
80 | rldicl r9, r4, 0, 52 | |
04f0fd64 | 81 | cmpldi cr7, r7, 4096-16 |
80ab6401 | 82 | bgt cr7, L(pagecross_check) |
04f0fd64 | 83 | cmpldi cr5, r9, 4096-16 |
80ab6401 RS |
84 | bgt cr5, L(pagecross_check) |
85 | ||
04f0fd64 | 86 | /* For short strings up to 16 bytes, load both s1 and s2 using |
80ab6401 RS |
87 | unaligned dwords and compare. */ |
88 | ld r8, 0(r3) | |
89 | ld r10, 0(r4) | |
90 | cmpb r12, r8, r0 | |
91 | cmpb r11, r8, r10 | |
92 | orc. r9, r12, r11 | |
93 | bne cr0, L(different_nocmpb) | |
94 | ||
95 | ld r8, 8(r3) | |
96 | ld r10, 8(r4) | |
97 | cmpb r12, r8, r0 | |
98 | cmpb r11, r8, r10 | |
99 | orc. r9, r12, r11 | |
100 | bne cr0, L(different_nocmpb) | |
101 | ||
04f0fd64 RS |
102 | addi r7, r3, 16 |
103 | addi r4, r4, 16 | |
80ab6401 RS |
104 | |
105 | L(align): | |
04f0fd64 | 106 | /* Now it has checked for first 16 bytes. */ |
80ab6401 RS |
107 | vspltisb v0, 0 |
108 | vspltisb v2, -1 | |
109 | lvsr v6, 0, r4 /* Compute mask. */ | |
110 | or r5, r4, r7 | |
111 | andi. r5, r5, 0xF | |
112 | beq cr0, L(aligned) | |
113 | andi. r5, r7, 0xF | |
114 | beq cr0, L(s1_align) | |
115 | lvsr v10, 0, r7 /* Compute mask. */ | |
116 | ||
117 | /* Both s1 and s2 are unaligned. */ | |
118 | GET16BYTES(v4, r7, v10) | |
119 | GET16BYTES(v5, r4, v6) | |
120 | VCMPNEZB(v7, v5, v4) | |
121 | beq cr6, L(match) | |
122 | b L(different) | |
123 | ||
124 | /* Align s1 to qw and adjust s2 address. */ | |
125 | .align 4 | |
126 | L(match): | |
127 | clrldi r6, r7, 60 | |
128 | subfic r5, r6, 16 | |
129 | add r7, r7, r5 | |
130 | add r4, r4, r5 | |
131 | andi. r5, r4, 0xF | |
132 | beq cr0, L(aligned) | |
133 | lvsr v6, 0, r4 | |
134 | /* There are 2 loops depending on the input alignment. | |
135 | Each loop gets 16 bytes from s1 and s2 and compares. | |
136 | Loop until a mismatch or null occurs. */ | |
137 | L(s1_align): | |
138 | lvx v4, r7, r0 | |
139 | GET16BYTES(v5, r4, v6) | |
140 | VCMPNEZB(v7, v5, v4) | |
141 | addi r7, r7, 16 | |
142 | addi r4, r4, 16 | |
143 | bne cr6, L(different) | |
144 | ||
145 | lvx v4, r7, r0 | |
146 | GET16BYTES(v5, r4, v6) | |
147 | VCMPNEZB(v7, v5, v4) | |
148 | addi r7, r7, 16 | |
149 | addi r4, r4, 16 | |
150 | bne cr6, L(different) | |
151 | ||
152 | lvx v4, r7, r0 | |
153 | GET16BYTES(v5, r4, v6) | |
154 | VCMPNEZB(v7, v5, v4) | |
155 | addi r7, r7, 16 | |
156 | addi r4, r4, 16 | |
157 | bne cr6, L(different) | |
158 | ||
159 | lvx v4, r7, r0 | |
160 | GET16BYTES(v5, r4, v6) | |
161 | VCMPNEZB(v7, v5, v4) | |
162 | addi r7, r7, 16 | |
163 | addi r4, r4, 16 | |
164 | beq cr6, L(s1_align) | |
165 | b L(different) | |
166 | ||
167 | .align 4 | |
168 | L(aligned): | |
169 | lvx v4, 0, r7 | |
170 | lvx v5, 0, r4 | |
171 | VCMPNEZB(v7, v5, v4) | |
172 | addi r7, r7, 16 | |
173 | addi r4, r4, 16 | |
174 | bne cr6, L(different) | |
175 | ||
176 | lvx v4, 0, r7 | |
177 | lvx v5, 0, r4 | |
178 | VCMPNEZB(v7, v5, v4) | |
179 | addi r7, r7, 16 | |
180 | addi r4, r4, 16 | |
181 | bne cr6, L(different) | |
182 | ||
183 | lvx v4, 0, r7 | |
184 | lvx v5, 0, r4 | |
185 | VCMPNEZB(v7, v5, v4) | |
186 | addi r7, r7, 16 | |
187 | addi r4, r4, 16 | |
188 | bne cr6, L(different) | |
189 | ||
190 | lvx v4, 0, r7 | |
191 | lvx v5, 0, r4 | |
192 | VCMPNEZB(v7, v5, v4) | |
193 | addi r7, r7, 16 | |
194 | addi r4, r4, 16 | |
195 | beq cr6, L(aligned) | |
196 | ||
197 | /* Calculate and return the difference. */ | |
198 | L(different): | |
199 | VCTZLSBB(r6, v7) | |
200 | VEXTUBRX(r5, r6, v4) | |
201 | VEXTUBRX(r4, r6, v5) | |
202 | subf r3, r4, r5 | |
203 | extsw r3, r3 | |
204 | blr | |
205 | ||
206 | .align 4 | |
207 | L(different_nocmpb): | |
208 | neg r3, r9 | |
209 | and r9, r9, r3 | |
210 | cntlzd r9, r9 | |
211 | subfic r9, r9, 63 | |
212 | srd r3, r8, r9 | |
213 | srd r10, r10, r9 | |
214 | rldicl r10, r10, 0, 56 | |
215 | rldicl r3, r3, 0, 56 | |
216 | subf r3, r10, r3 | |
217 | extsw r3, r3 | |
218 | blr | |
219 | ||
220 | .align 4 | |
221 | L(pagecross_check): | |
222 | subfic r9, r9, 4096 | |
223 | subfic r7, r7, 4096 | |
224 | cmpld cr7, r7, r9 | |
225 | bge cr7, L(pagecross) | |
226 | mr r7, r9 | |
227 | ||
228 | /* If unaligned 16 bytes reads across a 4K page boundary, it uses | |
229 | a simple byte a byte comparison until the page alignment for s1 | |
230 | is reached. */ | |
231 | L(pagecross): | |
232 | add r7, r3, r7 | |
233 | subf r9, r3, r7 | |
234 | mtctr r9 | |
235 | ||
236 | .align 4 | |
237 | L(pagecross_loop): | |
238 | /* Loads a byte from s1 and s2, compare if *s1 is equal to *s2 | |
239 | and if *s1 is '\0'. */ | |
240 | lbz r9, 0(r3) | |
241 | lbz r10, 0(r4) | |
242 | addi r3, r3, 1 | |
243 | addi r4, r4, 1 | |
244 | cmplw cr7, r9, r10 | |
245 | cmpdi cr5, r9, r0 | |
246 | bne cr7, L(pagecross_ne) | |
247 | beq cr5, L(pagecross_nullfound) | |
248 | bdnz L(pagecross_loop) | |
249 | b L(align) | |
250 | ||
251 | .align 4 | |
252 | L(pagecross_ne): | |
253 | extsw r3, r9 | |
254 | mr r9, r10 | |
255 | L(pagecross_retdiff): | |
256 | subf r9, r9, r3 | |
257 | extsw r3, r9 | |
258 | blr | |
259 | ||
260 | .align 4 | |
261 | L(pagecross_nullfound): | |
262 | li r3, 0 | |
263 | b L(pagecross_retdiff) | |
3bc426e1 | 264 | END (STRCMP) |
80ab6401 RS |
265 | libc_hidden_builtin_def (strcmp) |
266 | #else | |
267 | #include <sysdeps/powerpc/powerpc64/power8/strcmp.S> | |
268 | #endif |