/* __memcmpeq optimized with EVEX.
   Copyright (C) 2017-2023 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)

/* __memcmpeq is implemented as:
   1. Use ymm vector compares when possible.  The only case where
      vector compares are not possible is when size < VEC_SIZE
      and loading from either s1 or s2 would cause a page cross.
   2. Use xmm vector compare when size >= 8 bytes.
   3. Optimistically compare up to the first 4 * VEC_SIZE bytes one
      vector at a time, to check for early mismatches.  Only do this
      if it's guaranteed the work is not wasted.
   4. If size is 8 * VEC_SIZE or less, unroll the loop.
   5. Compare 4 * VEC_SIZE at a time with the aligned first memory
      area.
   6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
   7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
   8. Use 8 vector compares when size is 8 * VEC_SIZE or less.  */
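
/* As an illustrative sketch (not part of the original source), the
   size dispatch above amounts to:

       if (size <= VEC_SIZE)           masked 1-vector compare
       else if (size <= 2 * VEC_SIZE)  2 vector compares
       else if (size <= 4 * VEC_SIZE)  4 vector compares
       else if (size <= 8 * VEC_SIZE)  8 vector compares, branchless tail
       else                            aligned 4 * VEC_SIZE loop plus tail

   Unlike memcmp, __memcmpeq is only required to return zero when the
   buffers are equal and nonzero otherwise, so no mismatch position or
   sign ever needs to be computed.  */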

# include <sysdep.h>

# ifndef MEMCMPEQ
#  define MEMCMPEQ	__memcmpeq_evex
# endif

# ifndef VEC_SIZE
#  include "x86-evex512-vecs.h"
# endif
# include "reg-macros.h"


# if VEC_SIZE == 32

#  define TEST_ZERO_VCMP(reg)	inc %VGPR(reg)
#  define TEST_ZERO(reg)	test %VGPR(reg), %VGPR(reg)

#  define TO_32BIT_P1(reg)	/* Do nothing.  */
#  define TO_32BIT_P2(reg)	/* Do nothing.  */
#  define TO_32BIT(reg)		/* Do nothing.  */

#  define VEC_CMP	VPCMPEQ

# elif VEC_SIZE == 64

#  define TEST_ZERO_VCMP(reg)	TEST_ZERO(reg)
#  define TEST_ZERO(reg)	neg %VGPR(reg)


/* VEC_SIZE == 64 needs to reduce the 64-bit mask to a 32-bit
   int.  We have two methods for this.  If the mask was branched
   on, we use `neg` for the branch and then `sbb` to get the
   32-bit return.  If the mask was not branched on, we just use
   `popcntq`.  */
#  define TO_32BIT_P1(reg)	TEST_ZERO(reg)
#  define TO_32BIT_P2(reg)	sbb %VGPR_SZ(reg, 32), %VGPR_SZ(reg, 32)
#  define TO_32BIT(reg)		popcntq %reg, %reg
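
/* Worked example of the two reductions: `neg %reg` sets CF iff the
   64-bit mask was nonzero, so a following `sbb %reg32, %reg32`
   computes reg32 - reg32 - CF, i.e. -1 on any mismatch and 0 when the
   buffers are equal.  `popcntq` instead counts the mismatching bytes;
   the count of a 64-bit mask always fits in 32 bits and is nonzero
   iff some byte differed.  Either result satisfies __memcmpeq's
   zero/nonzero return contract.  */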

#  define VEC_CMP	VPCMPNEQ

# else
#  error "Unsupported VEC_SIZE"
# endif


# define VMOVU_MASK	vmovdqu8
# define VPCMPNEQ	vpcmpneqb
# define VPCMPEQ	vpcmpeqb
# define VPTEST		vptestmb

# define PAGE_SIZE	4096

	.section SECTION(.text), "ax", @progbits
ENTRY_P2ALIGN (MEMCMPEQ, 6)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
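	/* Under the x32 ABI the 32-bit size_t argument may arrive with
	   garbage in bits 63:32 of rdx; a 32-bit register write
	   zero-extends, so the `movl` above truncates it safely.  */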
	cmp	$VEC_SIZE, %RDX_LP
	/* Fall through for [0, VEC_SIZE] as it's the hottest path.  */
	ja	L(more_1x_vec)

	/* Create mask of bytes that are guaranteed to be valid because
	   of length (edx).  Using masked movs allows us to skip checks
	   for page crosses/zero size.  */
	mov	$-1, %VRAX
	bzhi	%VRDX, %VRAX, %VRAX
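	/* `bzhi` keeps only the low VRDX bits of VRAX: for length 5 the
	   k-mask below becomes 0x1f, so exactly 5 bytes participate in
	   the masked load and compare; length 0 yields an empty mask
	   and hence a zero (equal) return.  */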
	/* NB: A `jz` might be useful here.  Page faults that are
	   suppressed by predicated execution (the evex mask) can be
	   very slow.  The expectation is that this is not the norm:
	   "most" code will not regularly call 'memcmp' with length = 0
	   and memory that is not wired up.  */
	KMOV	%VRAX, %k2

	/* Use masked loads, as VEC_SIZE could page cross where length
	   (edx) would not.  */
	VMOVU_MASK (%rsi), %VMM(2){%k2}{z}
	VPCMPNEQ (%rdi), %VMM(2), %k1{%k2}
	KMOV	%k1, %VRAX
	TO_32BIT (VRAX)
	ret

	.p2align 4,, 3
L(last_1x_vec):
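	/* The final VEC_SIZE bytes are reloaded relative to the end of
	   the buffers (base + rdx - VEC_SIZE), so they may overlap
	   bytes the first vector already compared; the redundant
	   compare is cheaper than computing the exact remainder.  */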
	VMOVU	-(VEC_SIZE * 1)(%rsi, %rdx), %VMM(1)
	VPCMPNEQ -(VEC_SIZE * 1)(%rdi, %rdx), %VMM(1), %k1
	KMOV	%k1, %VRAX
	TO_32BIT_P1 (rax)
L(return_neq0):
	TO_32BIT_P2 (rax)
	ret


	.p2align 4,, 12
L(more_1x_vec):
	/* From VEC + 1 to 2 * VEC.  */
	VMOVU	(%rsi), %VMM(1)
	/* Use compare not equals to directly check for mismatch.  */
	VPCMPNEQ (%rdi), %VMM(1), %k1
	KMOV	%k1, %VRAX
	TEST_ZERO (rax)
	jnz	L(return_neq0)

	cmpq	$(VEC_SIZE * 2), %rdx
	jbe	L(last_1x_vec)

	/* Check second VEC no matter what.  */
	VMOVU	VEC_SIZE(%rsi), %VMM(2)
	VPCMPNEQ VEC_SIZE(%rdi), %VMM(2), %k1
	KMOV	%k1, %VRAX
	TEST_ZERO (rax)
	jnz	L(return_neq0)

	/* Less than 4 * VEC.  */
	cmpq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_2x_vec)

	/* Check third and fourth VEC no matter what.  */
	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(3)
	VEC_CMP	(VEC_SIZE * 2)(%rdi), %VMM(3), %k1
	KMOV	%k1, %VRAX
	TEST_ZERO_VCMP (rax)
	jnz	L(return_neq0)

	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(4)
	VEC_CMP	(VEC_SIZE * 3)(%rdi), %VMM(4), %k1
	KMOV	%k1, %VRAX
	TEST_ZERO_VCMP (rax)
	jnz	L(return_neq0)

	/* Go to 4x VEC loop.  */
	cmpq	$(VEC_SIZE * 8), %rdx
	ja	L(more_8x_vec)

	/* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
	   branches.  */

	VMOVU	-(VEC_SIZE * 1)(%rsi, %rdx), %VMM(1)
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VMM(2)
	addq	%rdx, %rdi

	/* Wait to load from s1 until after the address adjustment, so
	   the loads use single-register addressing and avoid micro-op
	   unlamination.  */

	/* vpxor will be all 0s if s1 and s2 are equal.  Otherwise it
	   will have some 1s.  */
	vpxorq	-(VEC_SIZE * 1)(%rdi), %VMM(1), %VMM(1)
	/* Ternary logic to xor -(VEC_SIZE * 2)(%rdi) with VEC(2) while
	   oring with VEC(1).  Result is stored in VEC(2).  */
	vpternlogd $0xde, -(VEC_SIZE * 2)(%rdi), %VMM(1), %VMM(2)
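	/* vpternlogd builds each result bit from imm8, indexed by the
	   corresponding bits of (dest, first source, second source or
	   memory).  imm8 0xde is the truth table for B | (A ^ C), so
	   the instruction above computes VEC(1) | (VEC(2) ^ mem) into
	   VEC(2), folding this vector's difference into the running
	   accumulator with one instruction instead of a vpxorq/vporq
	   pair.  */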

	cmpl	$(VEC_SIZE * 6), %edx
	jbe	L(4x_last_2x_vec)

	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VMM(3)
	vpxorq	-(VEC_SIZE * 3)(%rdi), %VMM(3), %VMM(3)

	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VMM(4)
	vpxorq	-(VEC_SIZE * 4)(%rdi), %VMM(4), %VMM(4)

	/* Or together VEC(4), VEC(3), and VEC(2) into VEC(2).  */
	vpternlogd $0xfe, %VMM(4), %VMM(3), %VMM(2)
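	/* imm8 0xfe is the truth table for A | B | C (set unless all
	   three inputs are 0), i.e. a three-way OR that carries any
	   mismatch bit from the tail vectors into VEC(2).  */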

	/* Compare VEC(2) with 0.  If any 1s, s1 and s2 don't match.  */
L(4x_last_2x_vec):
	VPTEST	%VMM(2), %VMM(2), %k1
	KMOV	%k1, %VRAX
	TO_32BIT (VRAX)
	ret


	.p2align 4,, 10
L(more_8x_vec):
	/* Set end of s1 in rdx.  */
	leaq	-(VEC_SIZE * 4)(%rdi, %rdx), %rdx
	/* rsi stores s2 - s1.  This allows the loop to only update one
	   pointer.  */
	subq	%rdi, %rsi
	/* Align s1 pointer.  */
	andq	$-VEC_SIZE, %rdi
	/* Adjust because the first 4x VEC were already checked.  */
	subq	$-(VEC_SIZE * 4), %rdi
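	/* With rsi = s2 - s1, every s2 load in the loop is addressed as
	   (%rsi, %rdi), so each iteration advances a single pointer.
	   `subq $-(VEC_SIZE * 4)` is used instead of
	   `addq $(VEC_SIZE * 4)` because for VEC_SIZE == 32 the
	   constant -128 fits in a sign-extended 8-bit immediate while
	   +128 does not, giving a shorter encoding.  */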
	.p2align 5,, 12
	.p2align 4,, 8
L(loop_4x_vec):
	VMOVU	(%rsi, %rdi), %VMM(1)
	vpxorq	(%rdi), %VMM(1), %VMM(1)

	VMOVU	VEC_SIZE(%rsi, %rdi), %VMM(2)
	vpternlogd $0xde, (VEC_SIZE)(%rdi), %VMM(1), %VMM(2)

	VMOVU	(VEC_SIZE * 2)(%rsi, %rdi), %VMM(3)
	vpxorq	(VEC_SIZE * 2)(%rdi), %VMM(3), %VMM(3)

	VMOVU	(VEC_SIZE * 3)(%rsi, %rdi), %VMM(4)
	vpxorq	(VEC_SIZE * 3)(%rdi), %VMM(4), %VMM(4)

	vpternlogd $0xfe, %VMM(2), %VMM(3), %VMM(4)
	VPTEST	%VMM(4), %VMM(4), %k1
	KMOV	%k1, %VRAX
	TEST_ZERO (rax)
	jnz	L(return_neq2)
	subq	$-(VEC_SIZE * 4), %rdi
	cmpq	%rdx, %rdi
	jb	L(loop_4x_vec)

	subq	%rdx, %rdi
	/* rdi has 4 * VEC_SIZE - remaining length.  */

	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %VMM(4)
	vpxorq	(VEC_SIZE * 3)(%rdx), %VMM(4), %VMM(4)

	/* Load regardless of branch.  */
	VMOVU	(VEC_SIZE * 2)(%rsi, %rdx), %VMM(3)
	/* Ternary logic to xor (VEC_SIZE * 2)(%rdx) with VEC(3) while
	   oring with VEC(4).  Result is stored in VEC(4).  */
	vpternlogd $0xf6, (VEC_SIZE * 2)(%rdx), %VMM(3), %VMM(4)
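	/* imm8 0xf6 is the truth table for A | (B ^ C), with A the
	   destination VEC(4), matching the comment above: the third
	   tail vector's xor-difference is merged into the accumulator
	   in one instruction.  */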

	/* Separate logic, as we can only use testb for VEC_SIZE == 64.
	 */
# if VEC_SIZE == 64
	testb	%dil, %dil
	js	L(8x_last_2x_vec)
# else
	cmpl	$(VEC_SIZE * 2), %edi
	jge	L(8x_last_2x_vec)
# endif
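	/* Here rdi = 4 * VEC_SIZE - remaining, which lies in
	   [0, 4 * VEC_SIZE).  For VEC_SIZE == 64 that range is
	   [0, 256), so the sign bit of %dil is set exactly when
	   rdi >= 128 = 2 * VEC_SIZE; `testb`/`js` is then a shorter
	   encoding of the `cmpl`/`jge` pair used for VEC_SIZE == 32.  */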

	VMOVU	VEC_SIZE(%rsi, %rdx), %VMM(2)
	vpxorq	VEC_SIZE(%rdx), %VMM(2), %VMM(2)

	VMOVU	(%rsi, %rdx), %VMM(1)
	vpxorq	(%rdx), %VMM(1), %VMM(1)

	vpternlogd $0xfe, %VMM(1), %VMM(2), %VMM(4)
L(8x_last_1x_vec):
L(8x_last_2x_vec):
	VPTEST	%VMM(4), %VMM(4), %k1
	KMOV	%k1, %VRAX
	TO_32BIT_P1 (rax)
L(return_neq2):
	TO_32BIT_P2 (rax)
	ret

	.p2align 4,, 4
L(last_2x_vec):
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VMM(1)
	vpxorq	-(VEC_SIZE * 2)(%rdi, %rdx), %VMM(1), %VMM(1)
	VMOVU	-(VEC_SIZE * 1)(%rsi, %rdx), %VMM(2)
	vpternlogd $0xde, -(VEC_SIZE * 1)(%rdi, %rdx), %VMM(1), %VMM(2)
	VPTEST	%VMM(2), %VMM(2), %k1
	KMOV	%k1, %VRAX
	TO_32BIT (VRAX)
	ret

	/* evex256: 1 byte from next cache line.  evex512: 15 bytes from
	   next cache line.  */
END (MEMCMPEQ)
#endif