/* Optimized strncmp implementation for PowerPC64/POWER9.
   Copyright (C) 2016-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */
#include <sysdep.h>

/* Implements the function

   int [r3] strncmp (const char *s1 [r3], const char *s2 [r4], size_t [r5] n)

   The implementation uses unaligned doubleword access to avoid specialized
   code paths depending of data alignment for first 32 bytes and uses
   vectorised loops after that.  */

/* Allow multiarch builds to rename the entry point.  */
#ifndef STRNCMP
# define STRNCMP strncmp
#endif

/* TODO: Change this to actual instructions when minimum binutils is upgraded
   to 2.27.  Macros are defined below for these newer instructions in order
   to maintain compatibility.  */

/* vctzlsbb RT,VRB — count trailing zero least-significant bits by byte;
   emitted as a raw opcode word because older binutils lack the mnemonic.  */
#define VCTZLSBB(r,v) .long (0x10010602 | ((r)<<(32-11)) | ((v)<<(32-21)))

/* vextubrx RT,RA,VRB — vector extract unsigned byte right-indexed,
   encoded as a raw opcode word for pre-2.27 binutils.  */
#define VEXTUBRX(t,a,b) .long (0x1000070d \
	| ((t)<<(32-11))  \
	| ((a)<<(32-16))  \
	| ((b)<<(32-21)) )

/* vcmpnezb. VRT,VRA,VRB — vector compare not-equal-or-zero byte (records
   CR6), encoded as a raw opcode word for pre-2.27 binutils.  */
#define VCMPNEZB(t,a,b) .long (0x10000507 \
	| ((t)<<(32-11))  \
	| ((a)<<(32-16))  \
	| ((b)<<(32-21)) )

/* Get 16 bytes for unaligned case.
   reg1: Vector to hold next 16 bytes.
   reg2: Address to read from.
   reg3: Permute control vector.
   The first aligned quadword covering reg2 is loaded, then the following
   quadword is loaded only when it is actually needed (no NUL seen in the
   bytes already fetched and more than r11 bytes remain), so the read never
   strays past a terminator into a possibly unmapped page.  */
#define GET16BYTES(reg1, reg2, reg3) \
	lvx	reg1, 0, reg2; \
	vperm	v8, v2, reg1, reg3; \
	vcmpequb.	v8, v0, v8; \
	beq	cr6, 1f; \
	vspltisb	v9, 0; \
	b	2f; \
	.align 4; \
1: \
	cmplw	cr6, r5, r11; \
	ble	cr6, 2f; \
	addi	r6, reg2, 16; \
	lvx	v9, 0, r6; \
2: \
	vperm	reg1, v9, reg1, reg3;

/* TODO: change this to .machine power9 when minimum binutils
   is upgraded to 2.27.  */
	.machine  power7
ENTRY_TOCLESS (STRNCMP, 4)
	/* Check if size is 0.  */
	cmpdi	cr0, r5, 0
	beq	cr0, L(ret0)
	li	r0, 0

	/* Check if [s1]+32 or [s2]+32 will cross a 4K page boundary using
	   the code:

	    (((size_t) s1) % PAGE_SIZE > (PAGE_SIZE - ITER_SIZE))

	   with PAGE_SIZE being 4096 and ITER_SIZE begin 32.  */
	rldicl	r8, r3, 0, 52
	cmpldi	cr7, r8, 4096-32
	bgt	cr7, L(pagecross)
	rldicl	r9, r4, 0, 52
	cmpldi	cr7, r9, 4096-32
	bgt	cr7, L(pagecross)

	/* For short strings up to 32 bytes, load both s1 and s2 using
	   unaligned dwords and compare.  */

	ld	r7, 0(r3)
	ld	r9, 0(r4)
	li	r8, 0
	cmpb	r8, r7, r8	/* Mark NUL bytes of s1 dword.  */
	cmpb	r6, r7, r9	/* Mark bytes equal between s1 and s2.  */
	orc.	r8, r8, r6	/* Non-zero iff a NUL or a mismatch.  */
	bne	cr0, L(different1)

	/* If the strings compared are equal, but size is less or equal
	   to 8, return 0.  */
	cmpldi	cr7, r5, 8
	li	r9, 0
	ble	cr7, L(ret1)
	addi	r5, r5, -8

	ld	r7, 8(r3)
	ld	r9, 8(r4)
	cmpb	r8, r7, r8
	cmpb	r6, r7, r9
	orc.	r8, r8, r6
	bne	cr0, L(different1)
	cmpldi	cr7, r5, 8
	mr	r9, r8
	ble	cr7, L(ret1)
	/* Update pointers and size.  */
	addi	r5, r5, -8
	addi	r3, r3, 16
	addi	r4, r4, 16

	ld	r7, 0(r3)
	ld	r9, 0(r4)
	li	r8, 0
	cmpb	r8, r7, r8
	cmpb	r6, r7, r9
	orc.	r8, r8, r6
	bne	cr0, L(different1)
	cmpldi	cr7, r5, 8
	li	r9, 0
	ble	cr7, L(ret1)
	addi	r5, r5, -8

	ld	r7, 8(r3)
	ld	r9, 8(r4)
	cmpb	r8, r7, r8
	cmpb	r6, r7, r9
	orc.	r8, r8, r6
	bne	cr0, L(different1)
	cmpldi	cr7, r5, 8
	mr	r9, r8
	ble	cr7, L(ret1)

	/* Update pointers and size.  */
	addi	r5, r5, -8
	addi	r3, r3, 16
	addi	r4, r4, 16
L(align):
	/* Now it has checked for first 32 bytes, align source1 to doubleword
	   and adjust source2 address.  */
	vspltisb	v0, 0
	vspltisb	v2, -1
	or	r6, r4, r3
	andi.	r6, r6, 0xF
	beq	cr0, L(aligned)
	lvsr	v6, 0, r4	/* Compute mask.  */
	clrldi	r6, r4, 60
	subfic	r11, r6, 16
	andi.	r6, r3, 0xF
	beq	cr0, L(s1_align)
	/* Both s1 and s2 are unaligned.  */
	GET16BYTES(v5, r4, v6)
	lvsr	v10, 0, r3	/* Compute mask.  */
	clrldi	r6, r3, 60
	subfic	r11, r6, 16
	GET16BYTES(v4, r3, v10)
	VCMPNEZB(v7, v5, v4)
	beq	cr6, L(match)
	b	L(different)

	/* Align s1 to qw and adjust s2 address.  */
	.align	4
L(match):
	cmpldi	cr7, r5, 16
	ble	cr7, L(ret0)
	subf	r5, r11, r5
	add	r3, r3, r11
	add	r4, r4, r11
	andi.	r11, r4, 0xF
	beq	cr0, L(aligned)
	lvsr	v6, 0, r4
	clrldi	r6, r4, 60
	subfic	r11, r6, 16
	/* There are 2 loops depending on the input alignment.
	   Each loop gets 16 bytes from s1 and s2, checks for null
	   and compares them.  Loops until a mismatch or null occurs.  */
L(s1_align):
	lvx	v4, 0, r3
	GET16BYTES(v5, r4, v6)
	VCMPNEZB(v7, v5, v4)
	bne	cr6, L(different)
	cmpldi	cr7, r5, 16
	ble	cr7, L(ret0)
	addi	r5, r5, -16
	addi	r3, r3, 16
	addi	r4, r4, 16

	lvx	v4, 0, r3
	GET16BYTES(v5, r4, v6)
	VCMPNEZB(v7, v5, v4)
	bne	cr6, L(different)
	cmpldi	cr7, r5, 16
	ble	cr7, L(ret0)
	addi	r5, r5, -16
	addi	r3, r3, 16
	addi	r4, r4, 16

	lvx	v4, 0, r3
	GET16BYTES(v5, r4, v6)
	VCMPNEZB(v7, v5, v4)
	bne	cr6, L(different)
	cmpldi	cr7, r5, 16
	ble	cr7, L(ret0)
	addi	r5, r5, -16
	addi	r3, r3, 16
	addi	r4, r4, 16

	lvx	v4, 0, r3
	GET16BYTES(v5, r4, v6)
	VCMPNEZB(v7, v5, v4)
	bne	cr6, L(different)
	cmpldi	cr7, r5, 16
	ble	cr7, L(ret0)
	addi	r5, r5, -16
	addi	r3, r3, 16
	addi	r4, r4, 16
	b	L(s1_align)
	.align	4
L(aligned):
	lvx	v4, 0, r3
	lvx	v5, 0, r4
	VCMPNEZB(v7, v5, v4)
	bne	cr6, L(different)
	cmpldi	cr7, r5, 16
	ble	cr7, L(ret0)
	addi	r5, r5, -16
	addi	r3, r3, 16
	addi	r4, r4, 16

	lvx	v4, 0, r3
	lvx	v5, 0, r4
	VCMPNEZB(v7, v5, v4)
	bne	cr6, L(different)
	cmpldi	cr7, r5, 16
	ble	cr7, L(ret0)
	addi	r5, r5, -16
	addi	r3, r3, 16
	addi	r4, r4, 16

	lvx	v4, 0, r3
	lvx	v5, 0, r4
	VCMPNEZB(v7, v5, v4)
	bne	cr6, L(different)
	cmpldi	cr7, r5, 16
	ble	cr7, L(ret0)
	addi	r5, r5, -16
	addi	r3, r3, 16
	addi	r4, r4, 16

	lvx	v4, 0, r3
	lvx	v5, 0, r4
	VCMPNEZB(v7, v5, v4)
	bne	cr6, L(different)
	cmpldi	cr7, r5, 16
	ble	cr7, L(ret0)
	addi	r5, r5, -16
	addi	r3, r3, 16
	addi	r4, r4, 16
	b	L(aligned)
	/* Calculate and return the difference.  */
L(different):
	VCTZLSBB(r6, v7)	/* r6 = index of first mismatch/NUL byte.  */
	cmplw	cr7, r5, r6
	ble	cr7, L(ret0)	/* Mismatch is beyond the n limit.  */
	VEXTUBRX(r5, r6, v4)
	VEXTUBRX(r4, r6, v5)
	subf	r3, r4, r5
	extsw	r3, r3
	blr

	.align	4
L(ret0):
	li	r9, 0
L(ret1):
	mr	r3, r9
	blr

	/* The code now checks if r8 and r5 are different by issuing a
	   cmpb and shifts the result based on its output:

	  leadzero = (__builtin_ffsl (z1) - 1);
	  leadzero = leadzero > (n-1)*8 ? (n-1)*8 : leadzero;
	  r1 = (r1 >> leadzero) & 0xFFUL;
	  r2 = (r2 >> leadzero) & 0xFFUL;
	  return r1 - r2;  */

	.align	4
L(different1):
	neg	r11, r8
	sldi	r5, r5, 3	/* Remaining size in bits.  */
	and	r8, r11, r8	/* Isolate lowest set bit of the mask.  */
	addi	r5, r5, -8
	cntlzd	r8, r8
	subfic	r8, r8, 63	/* Bit position of first difference.  */
	extsw	r8, r8
	cmpld	cr7, r8, r5
	ble	cr7, L(different2)
	mr	r8, r5	/* Clamp shift to the n limit.  */
L(different2):
	extsw	r8, r8
	srd	r7, r7, r8
	srd	r9, r9, r8
	rldicl	r3, r7, 0, 56
	rldicl	r9, r9, 0, 56
	subf	r9, r9, r3	/* Was garbled as "subf r9, r9, 3": subf
				   requires a register; r3 holds the masked
				   s1 byte from the rldicl above.  */
	extsw	r9, r9
	mr	r3, r9
	blr

	/* If unaligned 16 bytes reads across a 4K page boundary, it uses
	   a simple byte a byte comparison until the page alignment for s1
	   is reached.  */
	.align	4
L(pagecross):
	lbz	r7, 0(r3)
	lbz	r9, 0(r4)
	subfic	r8, r8, 4095	/* Bytes until the page boundary.  */
	cmplw	cr7, r9, r7
	bne	cr7, L(byte_ne_3)
	cmpdi	cr7, r9, 0
	beq	cr7, L(byte_ne_0)
	addi	r5, r5, -1
	subf	r7, r8, r5
	subf	r9, r7, r5
	addi	r9, r9, 1
	mtctr	r9	/* Loop count = min (n, bytes to boundary).  */
	b	L(pagecross_loop1)

	.align	4
L(pagecross_loop0):
	beq	cr7, L(ret0)
	lbz	r9, 0(r3)
	lbz	r8, 0(r4)
	addi	r5, r5, -1
	cmplw	cr7, r9, r8
	cmpdi	cr5, r9, 0
	bne	cr7, L(byte_ne_2)
	beq	cr5, L(byte_ne_0)
L(pagecross_loop1):
	cmpdi	cr7, r5, 0
	addi	r3, r3, 1
	addi	r4, r4, 1
	bdnz	L(pagecross_loop0)
	cmpdi	cr7, r7, 0
	li	r9, 0
	bne+	cr7, L(align)	/* More to compare: fall back to main path.  */
	b	L(ret1)

	.align	4
L(byte_ne_0):
	li	r7, 0
L(byte_ne_1):
	subf	r9, r9, r7
	extsw	r9, r9
	b	L(ret1)

	.align	4
L(byte_ne_2):
	extsw	r7, r9
	mr	r9, r8
	b	L(byte_ne_1)
L(byte_ne_3):
	extsw	r7, r7
	b	L(byte_ne_1)
END(STRNCMP)
libc_hidden_builtin_def(strncmp)