/* memcmp with SSE2.
   Copyright (C) 2017-2023 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

/* MINIMUM_X86_ISA_LEVEL <= 2 because there is no V2 implementation
   so we need this to build for ISA V2 builds.  */
#if ISA_SHOULD_BUILD (2)

# include <sysdep.h>

# ifndef MEMCMP
#  define MEMCMP __memcmp_sse2
# endif

# ifdef USE_AS_WMEMCMP
#  define PCMPEQ pcmpeqd
#  define CHAR_SIZE 4
#  define SIZE_OFFSET (0)
# else
#  define PCMPEQ pcmpeqb
#  define CHAR_SIZE 1
# endif

# ifdef USE_AS_MEMCMPEQ
#  define SIZE_OFFSET (0)
#  define CHECK_CMP(x, y) subl x, y
# else
#  ifndef SIZE_OFFSET
#   define SIZE_OFFSET (CHAR_PER_VEC * 2)
#  endif
#  define CHECK_CMP(x, y) cmpl x, y
# endif
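/* For memcmpeq only zero/non-zero matters, so CHECK_CMP uses `subl` and the
   (possibly non-zero) difference doubles as the return value; for memcmp a
   flags-only `cmpl` is used so the mask in the compared register survives
   and can be used to compute the ordered return value.  */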

# define VEC_SIZE 16
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)

# ifndef MEMCMP
#  define MEMCMP memcmp
# endif

        .text
ENTRY(MEMCMP)
# ifdef __ILP32__
        /* Clear the upper 32 bits.  */
        movl %edx, %edx
# endif
# ifdef USE_AS_WMEMCMP
        /* Use 0xffff to test for mismatches on pmovmskb bitmask.  Store
           in ecx for code size.  This is preferable to using `incw` as
           it avoids partial register stalls on older hardware (pre
           SnB).  */
        movl $0xffff, %ecx
# endif
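        /* Throughout this file: PCMPEQ sets a lane to all-ones when the two
           inputs match and pmovmskb gathers the byte sign bits into a 16-bit
           mask, so the mask is exactly 0xffff when an entire vector matches.
           Subtracting 0xffff (kept in ecx) from the mask therefore yields
           zero on a full match and non-zero otherwise.  */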
        cmpq $CHAR_PER_VEC, %rdx
        ja L(more_1x_vec)

# ifdef USE_AS_WMEMCMP
        /* Saves a byte of code by keeping the fall-through path for
           n = [2, 4] in the initial cache line.  */
        decl %edx
        jle L(cmp_0_1)

        movq (%rsi), %xmm0
        movq (%rdi), %xmm1
        PCMPEQ %xmm0, %xmm1
        pmovmskb %xmm1, %eax
        subl %ecx, %eax
        jnz L(ret_nonzero_vec_start_0)

        movq -4(%rsi, %rdx, CHAR_SIZE), %xmm0
        movq -4(%rdi, %rdx, CHAR_SIZE), %xmm1
        PCMPEQ %xmm0, %xmm1
        pmovmskb %xmm1, %eax
        subl %ecx, %eax
        jnz L(ret_nonzero_vec_end_0_adj)
# else
        cmpl $8, %edx
        ja L(cmp_9_16)

        cmpl $4, %edx
        jb L(cmp_0_3)

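        /* n is in [4, 8] here, so the 4 bytes at the start and the 4 bytes
           ending at offset n together cover the whole buffer (they may
           overlap).  */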
#  ifdef USE_AS_MEMCMPEQ
        movl (%rsi), %eax
        subl (%rdi), %eax

        movl -4(%rsi, %rdx), %esi
        subl -4(%rdi, %rdx), %esi

        orl %esi, %eax
        ret
#  else
        /* Combine comparisons for lo and hi 4-byte comparisons.  */
        movl -4(%rsi, %rdx), %ecx
        movl -4(%rdi, %rdx), %eax
        shlq $32, %rcx
        shlq $32, %rax
        movl (%rsi), %esi
        movl (%rdi), %edi
        orq %rsi, %rcx
        orq %rdi, %rax
        /* Only compute proper return if not-equal.  */
        cmpq %rcx, %rax
        jnz L(ret_nonzero)
        xorl %eax, %eax
        ret
#  endif

        .p2align 4,, 10
L(cmp_9_16):
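        /* n is in [9, 16]: compare the first 8 bytes and the (possibly
           overlapping) 8 bytes ending at offset n.  */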
#  ifdef USE_AS_MEMCMPEQ
        movq (%rsi), %rax
        subq (%rdi), %rax

        movq -8(%rsi, %rdx), %rcx
        subq -8(%rdi, %rdx), %rcx
        orq %rcx, %rax
        /* Convert 64 bit -> 32 bit boolean (we should have made the ABI
           return long).  */
        setnz %cl
        movzbl %cl, %eax
#  else
        movq (%rsi), %rcx
        movq (%rdi), %rax
        /* Only compute proper return if not-equal.  */
        cmpq %rcx, %rax
        jnz L(ret_nonzero)

        movq -8(%rsi, %rdx, CHAR_SIZE), %rcx
        movq -8(%rdi, %rdx, CHAR_SIZE), %rax
        /* Only compute proper return if not-equal.  */
        cmpq %rcx, %rax
        jnz L(ret_nonzero)
        xorl %eax, %eax
#  endif
# endif
        ret

        .p2align 4,, 8
L(cmp_0_1):
        /* Flag set by earlier comparison against 1.  */
        jne L(cmp_0_0)
# ifdef USE_AS_WMEMCMP
        movl (%rdi), %ecx
        xorl %edx, %edx
        cmpl (%rsi), %ecx
        je L(cmp_0_0)
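        /* setg leaves 1 in dl when the wide char from rdi compares (signed)
           greater and 0 otherwise; 2 * edx - 1 then gives the required
           +1 / -1 return value.  This idiom is used for every wmemcmp
           return below.  */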
        setg %dl
        leal -1(%rdx, %rdx), %eax
# else
        movzbl (%rdi), %eax
        movzbl (%rsi), %ecx
        subl %ecx, %eax
# endif
        ret

        /* Fits in aligning bytes.  */
L(cmp_0_0):
        xorl %eax, %eax
        ret

# ifdef USE_AS_WMEMCMP
        .p2align 4
L(ret_nonzero_vec_start_0):
        bsfl %eax, %eax
        movl (%rdi, %rax), %ecx
        xorl %edx, %edx
        cmpl (%rsi, %rax), %ecx
        /* NB: no partial register stall here because xorl zero idiom
           above.  */
        setg %dl
        leal -1(%rdx, %rdx), %eax
        ret
# else

#  ifndef USE_AS_MEMCMPEQ
        .p2align 4,, 14
L(ret_nonzero):
        /* Need to bswap to get proper return without branch.  */
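        /* The quadwords were loaded little-endian, so the byte at the lowest
           address sits in the least significant position.  bswap puts memory
           order into numeric order, so an unsigned compare of the swapped
           values orders by the first differing byte; sub/sbb/or then turns
           that into -1 or +1.  */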
        bswapq %rcx
        bswapq %rax
        subq %rcx, %rax
        sbbl %eax, %eax
        orl $1, %eax
        ret
#  endif

        .p2align 4
L(cmp_0_3):
#  ifdef USE_AS_MEMCMPEQ
        /* No reason to add to the dependency chain on rdx.  Saving the
           bytes here doesn't change the number of fetch blocks.  */
        cmpl $1, %edx
        jbe L(cmp_0_1)
#  else
        /* We need the code size savings to prevent taking an extra fetch
           block.  */
        decl %edx
        jle L(cmp_0_1)
#  endif
        movzwl (%rsi), %ecx
        movzwl (%rdi), %eax

#  ifdef USE_AS_MEMCMPEQ
        subl %ecx, %eax

        movzbl -1(%rsi, %rdx), %esi
        movzbl -1(%rdi, %rdx), %edi
        subl %edi, %esi
        orl %esi, %eax
#  else
        bswapl %ecx
        bswapl %eax

        /* Implicit right shift by one.  We just need to displace the
           sign bits.  */
        shrl %ecx
        shrl %eax
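        /* After the shift each value holds byte 0 in bits 23..30 and byte 1
           in bits 15..22, with bit 31 clear, so once the last byte is OR'd
           into bits 0..7 below the 32-bit subtraction cannot overflow and
           its sign matches the lexicographic comparison.  */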

        /* Eat a partial register stall here.  Saves code size by stopping
           L(cmp_0_3) from bleeding into the next fetch block and saves an
           ALU op.  */
        movb (%rsi, %rdx), %cl
        movzbl (%rdi, %rdx), %edi
        orl %edi, %eax
        subl %ecx, %eax
#  endif
        ret
# endif


        .p2align 5
L(more_1x_vec):
# ifndef USE_AS_WMEMCMP
        /* Use 0xffff to test for mismatches on pmovmskb bitmask.  Store
           in ecx for code size.  This is preferable to using `incw` as
           it avoids partial register stalls on older hardware (pre
           SnB).  */
        movl $0xffff, %ecx
# endif
        movups (%rsi), %xmm0
        movups (%rdi), %xmm1
        PCMPEQ %xmm0, %xmm1
        pmovmskb %xmm1, %eax
        subl %ecx, %eax
        jnz L(ret_nonzero_vec_start_0)
# if SIZE_OFFSET == 0
        cmpq $(CHAR_PER_VEC * 2), %rdx
# else
        /* Offset rdx.  Saves just enough code size to keep the
           L(last_2x_vec) case and the non-zero return in a single
           cache line.  */
        subq $(CHAR_PER_VEC * 2), %rdx
# endif
        ja L(more_2x_vec)

        movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm0
        movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm1
        PCMPEQ %xmm0, %xmm1
        pmovmskb %xmm1, %eax
        subl %ecx, %eax
# ifndef USE_AS_MEMCMPEQ
        /* Don't use `incw %ax` as the machines this code runs on are liable
           to have partial register stalls.  */
        jnz L(ret_nonzero_vec_end_0)
# else
        /* Various return targets for memcmpeq.  Will always be hot in
           Icache and get short encoding.  */
L(ret_nonzero_vec_start_1):
L(ret_nonzero_vec_start_0):
L(ret_nonzero_vec_end_0):
# endif
        ret


# ifndef USE_AS_MEMCMPEQ
#  ifdef USE_AS_WMEMCMP
        .p2align 4
L(ret_nonzero_vec_end_0_adj):
        addl $3, %edx
#  else
        .p2align 4,, 8
#  endif
L(ret_nonzero_vec_end_0):
        bsfl %eax, %eax
#  ifdef USE_AS_WMEMCMP
        leal (%rax, %rdx, CHAR_SIZE), %eax
        movl (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rax), %ecx
        xorl %edx, %edx
        cmpl (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rax), %ecx
        /* NB: no partial register stall here because xorl zero idiom
           above.  */
        setg %dl
        leal -1(%rdx, %rdx), %eax
#  else
        /* Use `addq` instead of `addl` here so that even if `rax` + `rdx`
           is negative, the value of the sum will be usable as a 64-bit
           offset (negative 32-bit numbers zero-extend to large and often
           out-of-bounds 64-bit offsets).  Note that `rax` + `rdx` >= 0 is
           an invariant when `memcmp` is used correctly, but if the input
           strings `rsi`/`rdi` are concurrently modified as the function
           runs (there is a data race) it is possible for `rax` + `rdx` to
           be negative.  Given that there is virtually no extra cost to
           using `addq` instead of `addl`, we may as well protect the
           data-race case.  */
        addq %rdx, %rax
        movzbl (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rax), %ecx
        movzbl (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rax), %eax
        subl %ecx, %eax
#  endif
        ret
#  ifndef USE_AS_WMEMCMP
        .p2align 4,, 10
L(ret_nonzero_vec_start_0):
        bsfl %eax, %eax
        movzbl (%rsi, %rax), %ecx
        movzbl (%rdi, %rax), %eax
        subl %ecx, %eax
        ret
#  endif
# else
# endif

        .p2align 5
L(more_2x_vec):
        movups (VEC_SIZE * 1)(%rsi), %xmm0
        movups (VEC_SIZE * 1)(%rdi), %xmm1
        PCMPEQ %xmm0, %xmm1
        pmovmskb %xmm1, %eax
        subl %ecx, %eax
        jnz L(ret_nonzero_vec_start_1)

        cmpq $(CHAR_PER_VEC * 4 - SIZE_OFFSET), %rdx
        jbe L(last_2x_vec)

        cmpq $(CHAR_PER_VEC * 8 - SIZE_OFFSET), %rdx
        ja L(more_8x_vec)


        /* Do comparisons for [65, 96] and [97, 128] 2x VEC at a time.
           This can hurt performance if there is a non-zero return in
           [65, 80] or [97, 112], but it helps performance otherwise.
           Generally the zero-return case is hotter.  */
        movups (VEC_SIZE * 2)(%rsi), %xmm0
        movups (VEC_SIZE * 2)(%rdi), %xmm1
        PCMPEQ %xmm0, %xmm1
        movups (VEC_SIZE * 3)(%rsi), %xmm2
        movups (VEC_SIZE * 3)(%rdi), %xmm3
        PCMPEQ %xmm2, %xmm3
        pand %xmm1, %xmm3

        pmovmskb %xmm3, %eax
        CHECK_CMP (%ecx, %eax)
        jnz L(ret_nonzero_vec_start_2_3)

        cmpl $(CHAR_PER_VEC * 6 - SIZE_OFFSET), %edx
        jbe L(last_2x_vec)

        movups (VEC_SIZE * 4)(%rsi), %xmm0
        movups (VEC_SIZE * 4)(%rdi), %xmm1
        PCMPEQ %xmm0, %xmm1
        movups (VEC_SIZE * 5)(%rsi), %xmm2
        movups (VEC_SIZE * 5)(%rdi), %xmm3
        PCMPEQ %xmm2, %xmm3
        pand %xmm1, %xmm3

        pmovmskb %xmm3, %eax
        CHECK_CMP (%ecx, %eax)
# ifdef USE_AS_MEMCMPEQ
        jz L(last_2x_vec)
        ret
# else
        jnz L(ret_nonzero_vec_start_4_5)
# endif
        .p2align 4
L(last_2x_vec):
        movups (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm0
        movups (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm1
        PCMPEQ %xmm0, %xmm1
        movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm2
        movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm3
        PCMPEQ %xmm2, %xmm3
        pand %xmm1, %xmm3
        pmovmskb %xmm3, %eax
        subl %ecx, %eax
# ifdef USE_AS_MEMCMPEQ
        /* Various return targets for memcmpeq.  Will always be hot in
           Icache and get short encoding.  */
L(ret_nonzero_vec_start_2_3):
L(ret_nonzero_vec_start_4_5):
        ret
# else
        jnz L(ret_nonzero_vec_end_1)
        ret

        .p2align 4,, 8
L(ret_nonzero_vec_end_1):
        pmovmskb %xmm1, %ecx
        /* High 16 bits of eax guaranteed to be all ones.  Rotate them in
           so we can do `or + not` with just `xor`.  */
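        /* (eax holds mask - 0xffff with mask != 0xffff, so it is negative
           and bits 16..31 are all set; xor against 0xffff is the same as
           not.)  */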
        rorl $16, %eax
        xorl %ecx, %eax
        /* Partial register stall.  */

        bsfl %eax, %eax
#  ifdef USE_AS_WMEMCMP
        leal (%rax, %rdx, CHAR_SIZE), %eax
        movl (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rax), %ecx
        xorl %edx, %edx
        cmpl (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rax), %ecx
        /* NB: no partial register stall here because xorl zero idiom
           above.  */
        setg %dl
        leal -1(%rdx, %rdx), %eax
#  else
        addl %edx, %eax
        movzbl (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rax), %ecx
        movzbl (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rax), %eax
        subl %ecx, %eax
#  endif
        ret

        .p2align 4
L(ret_nonzero_vec_start_4_5):
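        /* eax holds the combined (ANDed) mask, edx the first vector's mask.
           (eax << 16) + edx + 1: if the first vector mismatched, the +1
           exposes its lowest zero bit in the low half; otherwise it carries
           into the high half and exposes the lowest zero bit of the
           combined mask there, so bsf finds the first mismatching byte.  */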
        pmovmskb %xmm1, %edx
        sall $16, %eax
        leal 1(%rax, %rdx), %eax
        bsfl %eax, %eax
#  ifdef USE_AS_WMEMCMP
        movl (VEC_SIZE * 4)(%rdi, %rax), %ecx
        xorl %edx, %edx
        cmpl (VEC_SIZE * 4)(%rsi, %rax), %ecx
        /* NB: no partial register stall here because xorl zero idiom
           above.  */
        setg %dl
        leal -1(%rdx, %rdx), %eax
#  else
        movzbl (VEC_SIZE * 4)(%rsi, %rax), %ecx
        movzbl (VEC_SIZE * 4)(%rdi, %rax), %eax
        subl %ecx, %eax
#  endif
        ret

        .p2align 4,, 8
L(ret_nonzero_vec_start_1):
        bsfl %eax, %eax
#  ifdef USE_AS_WMEMCMP
        movl (VEC_SIZE * 1)(%rdi, %rax), %ecx
        xorl %edx, %edx
        cmpl (VEC_SIZE * 1)(%rsi, %rax), %ecx
        /* NB: no partial register stall here because xorl zero idiom
           above.  */
        setg %dl
        leal -1(%rdx, %rdx), %eax
#  else
        movzbl (VEC_SIZE * 1)(%rsi, %rax), %ecx
        movzbl (VEC_SIZE * 1)(%rdi, %rax), %eax
        subl %ecx, %eax
#  endif
        ret
# endif

        .p2align 4
L(more_8x_vec):
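        /* Align rdi down to VEC_SIZE and adjust rsi so that rsi - rdi is
           unchanged; rdx becomes a pointer VEC_SIZE * 6 before the end of
           the buffer and serves as the loop bound.  */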
        subq %rdi, %rsi
        leaq (VEC_SIZE * -6 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %rdx
        andq $(VEC_SIZE * -1), %rdi
        addq %rdi, %rsi
        .p2align 4
L(loop_4x):
        movups (VEC_SIZE * 2)(%rsi), %xmm0
        movups (VEC_SIZE * 3)(%rsi), %xmm1

        PCMPEQ (VEC_SIZE * 2)(%rdi), %xmm0
        PCMPEQ (VEC_SIZE * 3)(%rdi), %xmm1

        movups (VEC_SIZE * 4)(%rsi), %xmm2
        movups (VEC_SIZE * 5)(%rsi), %xmm3

        PCMPEQ (VEC_SIZE * 4)(%rdi), %xmm2
        PCMPEQ (VEC_SIZE * 5)(%rdi), %xmm3

        pand %xmm0, %xmm1
        pand %xmm2, %xmm3
        pand %xmm1, %xmm3

        pmovmskb %xmm3, %eax
        subl %ecx, %eax
        jnz L(ret_nonzero_loop)

        addq $(VEC_SIZE * 4), %rdi
        addq $(VEC_SIZE * 4), %rsi
        cmpq %rdi, %rdx
        ja L(loop_4x)
        /* Get remaining length in edx.  */
        subl %edi, %edx
        /* Restore offset so we can reuse L(last_2x_vec).  */
        addl $(VEC_SIZE * 6 - SIZE_OFFSET), %edx
# ifdef USE_AS_WMEMCMP
        shrl $2, %edx
# endif
        cmpl $(CHAR_PER_VEC * 4 - SIZE_OFFSET), %edx
        jbe L(last_2x_vec)

        movups (VEC_SIZE * 2)(%rsi), %xmm0
        movups (VEC_SIZE * 2)(%rdi), %xmm1
        PCMPEQ %xmm0, %xmm1
        movups (VEC_SIZE * 3)(%rsi), %xmm2
        movups (VEC_SIZE * 3)(%rdi), %xmm3
        PCMPEQ %xmm2, %xmm3
        pand %xmm1, %xmm3

        pmovmskb %xmm3, %eax
        CHECK_CMP (%ecx, %eax)
        jz L(last_2x_vec)
# ifdef USE_AS_MEMCMPEQ
L(ret_nonzero_loop):
        ret
# else

        .p2align 4
L(ret_nonzero_vec_start_2_3):
        pmovmskb %xmm1, %edx
        sall $16, %eax
        leal 1(%rax, %rdx), %eax

        bsfl %eax, %eax
#  ifdef USE_AS_WMEMCMP
        movl (VEC_SIZE * 2)(%rdi, %rax), %ecx
        xorl %edx, %edx
        cmpl (VEC_SIZE * 2)(%rsi, %rax), %ecx
        /* NB: no partial register stall here because xorl zero idiom
           above.  */
        setg %dl
        leal -1(%rdx, %rdx), %eax
#  else
        movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx
        movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax
        subl %ecx, %eax
#  endif
        ret

        .p2align 4
L(ret_nonzero_loop):
        pmovmskb %xmm0, %ecx
        pmovmskb %xmm1, %edx
        sall $(VEC_SIZE * 1), %edx
        leal 1(%rcx, %rdx), %edx
        pmovmskb %xmm2, %ecx
        /* High 16 bits of eax guaranteed to be all ones.  Rotate them in
           so we can do `or + not` with just `xor`.  */
        rorl $16, %eax
        xorl %ecx, %eax

        salq $32, %rax
        orq %rdx, %rax

        bsfq %rax, %rax
#  ifdef USE_AS_WMEMCMP
        movl (VEC_SIZE * 2)(%rdi, %rax), %ecx
        xorl %edx, %edx
        cmpl (VEC_SIZE * 2)(%rsi, %rax), %ecx
        /* NB: no partial register stall here because xorl zero idiom
           above.  */
        setg %dl
        leal -1(%rdx, %rdx), %eax
#  else
        movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx
        movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax
        subl %ecx, %eax
#  endif
        ret
# endif
END(MEMCMP)
#endif