/* Copyright (C) 2013-2023 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

19 | #include <sysdep.h> | |
20 | ||
21 | /* Assumptions: | |
22 | * | |
23 | * ARMv8-a, AArch64 | |
24 | */ | |
25 | ||
26 | #define REP8_01 0x0101010101010101 | |
27 | #define REP8_7f 0x7f7f7f7f7f7f7f7f | |
a0b1cd88 MS |
28 | |
29 | /* Parameters and result. */ | |
30 | #define src1 x0 | |
31 | #define src2 x1 | |
32 | #define limit x2 | |
33 | #define result x0 | |
34 | ||
35 | /* Internal variables. */ | |
36 | #define data1 x3 | |
37 | #define data1w w3 | |
38 | #define data2 x4 | |
39 | #define data2w w4 | |
40 | #define has_nul x5 | |
41 | #define diff x6 | |
42 | #define syndrome x7 | |
43 | #define tmp1 x8 | |
44 | #define tmp2 x9 | |
45 | #define tmp3 x10 | |
46 | #define zeroones x11 | |
47 | #define pos x12 | |
03e1378f AB |
48 | #define mask x13 |
49 | #define endloop x14 | |
7108f1f9 | 50 | #define count mask |
03e1378f AB |
51 | #define offset pos |
52 | #define neg_offset x15 | |
a0b1cd88 | 53 | |
03e1378f AB |
54 | /* Define endian dependent shift operations. |
55 | On big-endian early bytes are at MSB and on little-endian LSB. | |
56 | LS_FW means shifting towards early bytes. | |
57 | LS_BK means shifting towards later bytes. | |
58 | */ | |
59 | #ifdef __AARCH64EB__ | |
60 | #define LS_FW lsl | |
61 | #define LS_BK lsr | |
62 | #else | |
63 | #define LS_FW lsr | |
64 | #define LS_BK lsl | |
65 | #endif | |
66 | ||
67 | .text | |
68 | .p2align 6 | |
69 | .rep 9 | |
70 | nop /* Pad so that the loop below fits a cache line. */ | |
71 | .endr | |
72 | ENTRY_ALIGN (strncmp, 0) | |
a0b1cd88 MS |
73 | cbz limit, L(ret0) |
74 | eor tmp1, src1, src2 | |
75 | mov zeroones, #REP8_01 | |
76 | tst tmp1, #7 | |
7108f1f9 | 77 | and count, src1, #7 |
a0b1cd88 | 78 | b.ne L(misaligned8) |
7108f1f9 | 79 | cbnz count, L(mutual_align) |
a0b1cd88 MS |
80 | |
81 | /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 | |
82 | (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and | |
83 | can be done in parallel across the entire word. */ | |
84 | /* Start of performance-critical section -- one 64B cache line. */ | |
85 | L(loop_aligned): | |
86 | ldr data1, [src1], #8 | |
87 | ldr data2, [src2], #8 | |
88 | L(start_realigned): | |
03e1378f | 89 | subs limit, limit, #8 |
a0b1cd88 MS |
90 | sub tmp1, data1, zeroones |
91 | orr tmp2, data1, #REP8_7f | |
92 | eor diff, data1, data2 /* Non-zero if differences found. */ | |
03e1378f | 93 | csinv endloop, diff, xzr, hi /* Last Dword or differences. */ |
a0b1cd88 MS |
94 | bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ |
95 | ccmp endloop, #0, #0, eq | |
96 | b.eq L(loop_aligned) | |
97 | /* End of performance-critical section -- one 64B cache line. */ | |
98 | ||
03e1378f AB |
99 | L(full_check): |
100 | #ifndef __AARCH64EB__ | |
a0b1cd88 | 101 | orr syndrome, diff, has_nul |
03e1378f AB |
102 | add limit, limit, 8 /* Rewind limit to before last subs. */ |
103 | L(syndrome_check): | |
104 | /* Limit was reached. Check if the NUL byte or the difference | |
105 | is before the limit. */ | |
a0b1cd88 MS |
106 | rev syndrome, syndrome |
107 | rev data1, data1 | |
a0b1cd88 MS |
108 | clz pos, syndrome |
109 | rev data2, data2 | |
110 | lsl data1, data1, pos | |
03e1378f | 111 | cmp limit, pos, lsr #3 |
a0b1cd88 MS |
112 | lsl data2, data2, pos |
113 | /* But we need to zero-extend (char is unsigned) the value and then | |
114 | perform a signed 32-bit subtraction. */ | |
115 | lsr data1, data1, #56 | |
116 | sub result, data1, data2, lsr #56 | |
03e1378f AB |
117 | csel result, result, xzr, hi |
118 | ret | |
a0b1cd88 | 119 | #else |
03e1378f AB |
120 | /* Not reached the limit, must have found the end or a diff. */ |
121 | tbz limit, #63, L(not_limit) | |
122 | add tmp1, limit, 8 | |
123 | cbz limit, L(not_limit) | |
124 | ||
125 | lsl limit, tmp1, #3 /* Bits -> bytes. */ | |
126 | mov mask, #~0 | |
127 | lsr mask, mask, limit | |
128 | bic data1, data1, mask | |
129 | bic data2, data2, mask | |
130 | ||
131 | /* Make sure that the NUL byte is marked in the syndrome. */ | |
132 | orr has_nul, has_nul, mask | |
133 | ||
134 | L(not_limit): | |
a0b1cd88 MS |
135 | /* For big-endian we cannot use the trick with the syndrome value |
136 | as carry-propagation can corrupt the upper bits if the trailing | |
137 | bytes in the string contain 0x01. */ | |
138 | /* However, if there is no NUL byte in the dword, we can generate | |
139 | the result directly. We can't just subtract the bytes as the | |
140 | MSB might be significant. */ | |
141 | cbnz has_nul, 1f | |
142 | cmp data1, data2 | |
143 | cset result, ne | |
144 | cneg result, result, lo | |
03e1378f | 145 | ret |
a0b1cd88 MS |
146 | 1: |
147 | /* Re-compute the NUL-byte detection, using a byte-reversed value. */ | |
148 | rev tmp3, data1 | |
149 | sub tmp1, tmp3, zeroones | |
150 | orr tmp2, tmp3, #REP8_7f | |
151 | bic has_nul, tmp1, tmp2 | |
152 | rev has_nul, has_nul | |
153 | orr syndrome, diff, has_nul | |
154 | clz pos, syndrome | |
03e1378f AB |
155 | /* The most-significant-non-zero bit of the syndrome marks either the |
156 | first bit that is different, or the top bit of the first zero byte. | |
a0b1cd88 MS |
157 | Shifting left now will bring the critical information into the |
158 | top bits. */ | |
03e1378f | 159 | L(end_quick): |
a0b1cd88 MS |
160 | lsl data1, data1, pos |
161 | lsl data2, data2, pos | |
162 | /* But we need to zero-extend (char is unsigned) the value and then | |
163 | perform a signed 32-bit subtraction. */ | |
164 | lsr data1, data1, #56 | |
165 | sub result, data1, data2, lsr #56 | |
03e1378f | 166 | ret |
a0b1cd88 MS |
167 | #endif |
168 | ||
169 | L(mutual_align): | |
170 | /* Sources are mutually aligned, but are not currently at an | |
171 | alignment boundary. Round down the addresses and then mask off | |
172 | the bytes that precede the start point. | |
173 | We also need to adjust the limit calculations, but without | |
174 | overflowing if the limit is near ULONG_MAX. */ | |
175 | bic src1, src1, #7 | |
176 | bic src2, src2, #7 | |
177 | ldr data1, [src1], #8 | |
7108f1f9 | 178 | neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */ |
a0b1cd88 MS |
179 | ldr data2, [src2], #8 |
180 | mov tmp2, #~0 | |
03e1378f AB |
181 | LS_FW tmp2, tmp2, tmp3 /* Shift (count & 63). */ |
182 | /* Adjust the limit and ensure it doesn't overflow. */ | |
183 | adds limit, limit, count | |
184 | csinv limit, limit, xzr, lo | |
a0b1cd88 MS |
185 | orr data1, data1, tmp2 |
186 | orr data2, data2, tmp2 | |
a0b1cd88 MS |
187 | b L(start_realigned) |
188 | ||
a0b1cd88 | 189 | .p2align 6 |
7108f1f9 | 190 | /* Don't bother with dwords for up to 16 bytes. */ |
a0b1cd88 | 191 | L(misaligned8): |
7108f1f9 SP |
192 | cmp limit, #16 |
193 | b.hs L(try_misaligned_words) | |
194 | ||
195 | L(byte_loop): | |
a0b1cd88 MS |
196 | /* Perhaps we can do better than this. */ |
197 | ldrb data1w, [src1], #1 | |
198 | ldrb data2w, [src2], #1 | |
199 | subs limit, limit, #1 | |
7108f1f9 | 200 | ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */ |
a0b1cd88 | 201 | ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ |
7108f1f9 SP |
202 | b.eq L(byte_loop) |
203 | L(done): | |
a0b1cd88 | 204 | sub result, data1, data2 |
03e1378f | 205 | ret |
7108f1f9 SP |
206 | /* Align the SRC1 to a dword by doing a bytewise compare and then do |
207 | the dword loop. */ | |
208 | L(try_misaligned_words): | |
03e1378f | 209 | cbz count, L(src1_aligned) |
7108f1f9 SP |
210 | |
211 | neg count, count | |
212 | and count, count, #7 | |
213 | sub limit, limit, count | |
7108f1f9 SP |
214 | |
215 | L(page_end_loop): | |
216 | ldrb data1w, [src1], #1 | |
217 | ldrb data2w, [src2], #1 | |
218 | cmp data1w, #1 | |
219 | ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ | |
220 | b.ne L(done) | |
221 | subs count, count, #1 | |
222 | b.hi L(page_end_loop) | |
223 | ||
03e1378f AB |
224 | /* The following diagram explains the comparison of misaligned strings. |
225 | The bytes are shown in natural order. For little-endian, it is | |
226 | reversed in the registers. The "x" bytes are before the string. | |
227 | The "|" separates data that is loaded at one time. | |
228 | src1 | a a a a a a a a | b b b c c c c c | . . . | |
229 | src2 | x x x x x a a a a a a a a b b b | c c c c c . . . | |
230 | After shifting in each step, the data looks like this: | |
231 | STEP_A STEP_B STEP_C | |
232 | data1 a a a a a a a a b b b c c c c c b b b c c c c c | |
233 | data2 a a a a a a a a b b b 0 0 0 0 0 0 0 0 c c c c c | |
234 | The bytes with "0" are eliminated from the syndrome via mask. | |
235 | Align SRC2 down to 16 bytes. This way we can read 16 bytes at a | |
236 | time from SRC2. The comparison happens in 3 steps. After each step | |
237 | the loop can exit, or read from SRC1 or SRC2. */ | |
238 | L(src1_aligned): | |
239 | /* Calculate offset from 8 byte alignment to string start in bits. No | |
240 | need to mask offset since shifts are ignoring upper bits. */ | |
241 | lsl offset, src2, #3 | |
242 | bic src2, src2, #0xf | |
243 | mov mask, -1 | |
244 | neg neg_offset, offset | |
245 | ldr data1, [src1], #8 | |
246 | ldp tmp1, tmp2, [src2], #16 | |
247 | LS_BK mask, mask, neg_offset | |
248 | and neg_offset, neg_offset, #63 /* Need actual value for cmp later. */ | |
249 | /* Skip the first compare if data in tmp1 is irrelevant. */ | |
250 | tbnz offset, 6, L(misaligned_mid_loop) | |
251 | ||
7108f1f9 | 252 | L(loop_misaligned): |
03e1378f AB |
253 | /* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/ |
254 | LS_FW data2, tmp1, offset | |
255 | LS_BK tmp1, tmp2, neg_offset | |
256 | subs limit, limit, #8 | |
257 | orr data2, data2, tmp1 /* 8 bytes from SRC2 combined from two regs.*/ | |
258 | sub has_nul, data1, zeroones | |
259 | eor diff, data1, data2 /* Non-zero if differences found. */ | |
260 | orr tmp3, data1, #REP8_7f | |
261 | csinv endloop, diff, xzr, hi /* If limit, set to all ones. */ | |
262 | bic has_nul, has_nul, tmp3 /* Non-zero if NUL byte found in SRC1. */ | |
263 | orr tmp3, endloop, has_nul | |
264 | cbnz tmp3, L(full_check) | |
7108f1f9 SP |
265 | |
266 | ldr data1, [src1], #8 | |
03e1378f AB |
267 | L(misaligned_mid_loop): |
268 | /* STEP_B: Compare first part of data1 to second part of tmp2. */ | |
269 | LS_FW data2, tmp2, offset | |
270 | #ifdef __AARCH64EB__ | |
271 | /* For big-endian we do a byte reverse to avoid carry-propagation | |
272 | problem described above. This way we can reuse the has_nul in the | |
273 | next step and also use syndrome value trick at the end. */ | |
274 | rev tmp3, data1 | |
275 | #define data1_fixed tmp3 | |
276 | #else | |
277 | #define data1_fixed data1 | |
278 | #endif | |
279 | sub has_nul, data1_fixed, zeroones | |
280 | orr tmp3, data1_fixed, #REP8_7f | |
281 | eor diff, data2, data1 /* Non-zero if differences found. */ | |
282 | bic has_nul, has_nul, tmp3 /* Non-zero if NUL terminator. */ | |
283 | #ifdef __AARCH64EB__ | |
284 | rev has_nul, has_nul | |
285 | #endif | |
286 | cmp limit, neg_offset, lsr #3 | |
287 | orr syndrome, diff, has_nul | |
288 | bic syndrome, syndrome, mask /* Ignore later bytes. */ | |
289 | csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */ | |
290 | cbnz tmp3, L(syndrome_check) | |
7108f1f9 | 291 | |
03e1378f AB |
292 | /* STEP_C: Compare second part of data1 to first part of tmp1. */ |
293 | ldp tmp1, tmp2, [src2], #16 | |
294 | cmp limit, #8 | |
295 | LS_BK data2, tmp1, neg_offset | |
296 | eor diff, data2, data1 /* Non-zero if differences found. */ | |
297 | orr syndrome, diff, has_nul | |
298 | and syndrome, syndrome, mask /* Ignore earlier bytes. */ | |
299 | csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */ | |
300 | cbnz tmp3, L(syndrome_check) | |
301 | ||
302 | ldr data1, [src1], #8 | |
303 | sub limit, limit, #8 | |
304 | b L(loop_misaligned) | |
305 | ||
306 | #ifdef __AARCH64EB__ | |
307 | L(syndrome_check): | |
308 | clz pos, syndrome | |
309 | cmp pos, limit, lsl #3 | |
310 | b.lo L(end_quick) | |
311 | #endif | |
7108f1f9 SP |
312 | |
313 | L(ret0): | |
314 | mov result, #0 | |
03e1378f | 315 | ret |
7108f1f9 | 316 | |
a0b1cd88 MS |
317 | END (strncmp) |
318 | libc_hidden_builtin_def (strncmp) |