/* __memcmpeq optimized with EVEX.
   Copyright (C) 2017-2023 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)

/* __memcmpeq is implemented as:
   1. Use ymm vector compares when possible.  The only case where
      vector compares are not possible is when size < VEC_SIZE
      and loading from either s1 or s2 would cause a page cross.
   2. Use xmm vector compare when size >= 8 bytes.
   3. Optimistically compare up to the first 4 * VEC_SIZE one at a
      time to check for early mismatches.  Only do this if it is
      guaranteed the work is not wasted.
   4. If size is 8 * VEC_SIZE or less, unroll the loop.
   5. Compare 4 * VEC_SIZE at a time with the aligned first memory
      area.
   6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
   7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
   8. Use 8 vector compares when size is 8 * VEC_SIZE or less.  */

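/* For reference, a rough C-level sketch of the contract the steps
   above implement (illustrative only; __memcmpeq_sketch is not a real
   symbol and memcmp is used purely to express the semantics).  Only
   the zero/non-zero distinction of the return value is meaningful;
   unlike memcmp, no ordering information is implied:

	#include <stddef.h>
	#include <string.h>

	int
	__memcmpeq_sketch (const void *s1, const void *s2, size_t n)
	{
	  return memcmp (s1, s2, n) != 0;
	}
 */
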
# include <sysdep.h>

# ifndef MEMCMPEQ
#  define MEMCMPEQ	__memcmpeq_evex
# endif

# ifndef VEC_SIZE
#  include "x86-evex512-vecs.h"
# endif
# include "reg-macros.h"


# if VEC_SIZE == 32

#  define TEST_ZERO_VCMP(reg)	inc %VGPR(reg)
#  define TEST_ZERO(reg)	test %VGPR(reg), %VGPR(reg)

#  define TO_32BIT_P1(reg)	/* Do nothing. */
#  define TO_32BIT_P2(reg)	/* Do nothing. */
#  define TO_32BIT(reg)	/* Do nothing. */

#  define VEC_CMP	VPCMPEQ

# elif VEC_SIZE == 64

#  define TEST_ZERO_VCMP(reg)	TEST_ZERO(reg)
#  define TEST_ZERO(reg)	neg %VGPR(reg)


	/* VEC_SIZE == 64 needs to reduce the 64-bit mask to a 32-bit
	   int.  We have two methods for this.  If the mask was branched
	   on, we use `neg` for the branch, then `sbb` to get the 32-bit
	   return.  If the mask was not branched on, we just use
	   `popcntq`.  */
#  define TO_32BIT_P1(reg)	TEST_ZERO(reg)
#  define TO_32BIT_P2(reg)	sbb %VGPR_SZ(reg, 32), %VGPR_SZ(reg, 32)
#  define TO_32BIT(reg)	popcntq %reg, %reg

#  define VEC_CMP	VPCMPNEQ

# else
#  error "Unsupported VEC_SIZE"
# endif


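/* A concrete reading of the reductions above: for VEC_SIZE == 64,
   `neg` sets CF exactly when the 64-bit mask is non-zero, so the
   following `sbb reg32, reg32` yields 0xffffffff (mismatch) or 0
   (equal) regardless of the negated value itself, while `popcntq`
   maps any non-zero mask to a non-zero count that fits in 32 bits.
   For VEC_SIZE == 32 the mask already fits in 32 bits; there VEC_CMP
   is an equal-compare, so TEST_ZERO_VCMP uses `inc`, which wraps an
   all-ones (all-equal) mask to zero.  */
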
# define VMOVU_MASK	vmovdqu8
# define VPCMPNEQ	vpcmpneqb
# define VPCMPEQ	vpcmpeqb
# define VPTEST	vptestmb

# define PAGE_SIZE	4096

	.section SECTION(.text), "ax", @progbits
ENTRY_P2ALIGN (MEMCMPEQ, 6)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	/* Fall through for [0, VEC_SIZE] as it is the hottest.  */
	ja	L(more_1x_vec)

	/* Create mask of bytes that are guaranteed to be valid because
	   of length (edx).  Using masked movs allows us to skip checks
	   for page crosses/zero size.  */
	mov	$-1, %VRAX
	bzhi	%VRDX, %VRAX, %VRAX
	/* NB: A `jz` might be useful here.  Page-faults that are
	   invalidated by predicated execution (the evex mask) can be
	   very slow.  The expectation is this is not the norm, so
	   "most" code will not regularly call 'memcmp' with length = 0
	   and memory that is not wired up.  */
	KMOV	%VRAX, %k2

	/* Use masked loads as VEC_SIZE could page cross where length
	   (edx) would not.  */
	VMOVU_MASK (%rsi), %VMM(2){%k2}{z}
	VPCMPNEQ (%rdi), %VMM(2), %k1{%k2}
	KMOV	%k1, %VRAX
	TO_32BIT (VRAX)
	ret
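	/* A worked example of the masked path above, with VEC_SIZE == 32
	   and length 5: `bzhi` turns the all-ones register into 0x1f, so
	   the {%k2}{z} load touches only bytes 0-4 of s2 and the masked
	   compare ignores the rest.  Bytes whose mask bit is clear cannot
	   fault, which is why no page-cross or zero-length check is
	   needed.  */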

	.p2align 4,, 3
L(last_1x_vec):
	VMOVU	-(VEC_SIZE * 1)(%rsi, %rdx), %VMM(1)
	VPCMPNEQ -(VEC_SIZE * 1)(%rdi, %rdx), %VMM(1), %k1
	KMOV	%k1, %VRAX
	TO_32BIT_P1 (rax)
L(return_neq0):
	TO_32BIT_P2 (rax)
	ret


	.p2align 4,, 12
L(more_1x_vec):
	/* From VEC + 1 to 2 * VEC.  */
	VMOVU	(%rsi), %VMM(1)
	/* Use compare not equals to directly check for mismatch.  */
	VPCMPNEQ (%rdi), %VMM(1), %k1
	KMOV	%k1, %VRAX
	TEST_ZERO (rax)
	jnz	L(return_neq0)

	cmpq	$(VEC_SIZE * 2), %rdx
	jbe	L(last_1x_vec)

	/* Check second VEC no matter what.  */
	VMOVU	VEC_SIZE(%rsi), %VMM(2)
	VPCMPNEQ VEC_SIZE(%rdi), %VMM(2), %k1
	KMOV	%k1, %VRAX
	TEST_ZERO (rax)
	jnz	L(return_neq0)

	/* Less than 4 * VEC.  */
	cmpq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_2x_vec)

	/* Check third and fourth VEC no matter what.  */
	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(3)
	VEC_CMP	(VEC_SIZE * 2)(%rdi), %VMM(3), %k1
	KMOV	%k1, %VRAX
	TEST_ZERO_VCMP (rax)
	jnz	L(return_neq0)

	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(4)
	VEC_CMP	(VEC_SIZE * 3)(%rdi), %VMM(4), %k1
	KMOV	%k1, %VRAX
	TEST_ZERO_VCMP (rax)
	jnz	L(return_neq0)

	/* Go to 4x VEC loop.  */
	cmpq	$(VEC_SIZE * 8), %rdx
	ja	L(more_8x_vec)

	/* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
	   branches.  */

	VMOVU	-(VEC_SIZE * 1)(%rsi, %rdx), %VMM(1)
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VMM(2)
	addq	%rdx, %rdi

	/* Wait to load from s1 until the address has been adjusted;
	   keeping the loads below to a single-register address avoids
	   unlamination of the micro-fused load + xor.  */

	/* vpxor will be all 0s if s1 and s2 are equal.  Otherwise it
	   will have some 1s.  */
	vpxorq	-(VEC_SIZE * 1)(%rdi), %VMM(1), %VMM(1)
	/* Ternary logic to xor -(VEC_SIZE * 2)(%rdi) with VEC(2) while
	   oring with VEC(1).  Result is stored in VEC(2).  */
	vpternlogd $0xde, -(VEC_SIZE * 2)(%rdi), %VMM(1), %VMM(2)

	cmpl	$(VEC_SIZE * 6), %edx
	jbe	L(4x_last_2x_vec)

	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VMM(3)
	vpxorq	-(VEC_SIZE * 3)(%rdi), %VMM(3), %VMM(3)
	/* Xor the fourth-from-last VECs of s1 and s2 into VEC(4).  */
	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VMM(4)
	vpxorq	-(VEC_SIZE * 4)(%rdi), %VMM(4), %VMM(4)

	/* Or together VEC(4), VEC(3), and VEC(2) into VEC(2).  */
	vpternlogd $0xfe, %VMM(4), %VMM(3), %VMM(2)
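	/* On the vpternlogd immediates used in this file: reading the
	   AT&T operands right to left as A (the destination), B, and C,
	   imm $0xde computes B | (A ^ C), imm $0xf6 computes
	   A | (B ^ C), and imm $0xfe computes A | B | C, so a single
	   instruction both xors a fresh pair of VECs and ors the result
	   into the running mismatch accumulator.  */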

	/* Compare VEC(2) with 0.  If any 1s, s1 and s2 don't match.  */
L(4x_last_2x_vec):
	VPTEST	%VMM(2), %VMM(2), %k1
	KMOV	%k1, %VRAX
	TO_32BIT (VRAX)
	ret


	.p2align 4,, 10
L(more_8x_vec):
	/* Set end of s1 in rdx.  */
	leaq	-(VEC_SIZE * 4)(%rdi, %rdx), %rdx
	/* rsi stores s2 - s1.  This allows the loop to only update one
	   pointer.  */
	subq	%rdi, %rsi
	/* Align s1 pointer.  */
	andq	$-VEC_SIZE, %rdi
	/* Adjust because the first 4x VEC were checked already.  */
	subq	$-(VEC_SIZE * 4), %rdi
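	/* With rsi holding s2 - s1, the loop below reaches s2 through
	   (%rsi, %rdi) while s1 is just (%rdi), so only %rdi has to be
	   updated.  The `subq` of a negative constant is presumably
	   preferred over `addq` because -(VEC_SIZE * 4) fits in a
	   sign-extended 8-bit immediate when VEC_SIZE == 32.  */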
	.p2align 5,, 12
	.p2align 4,, 8
L(loop_4x_vec):
	VMOVU	(%rsi, %rdi), %VMM(1)
	vpxorq	(%rdi), %VMM(1), %VMM(1)

	VMOVU	VEC_SIZE(%rsi, %rdi), %VMM(2)
	vpternlogd $0xde, (VEC_SIZE)(%rdi), %VMM(1), %VMM(2)

	VMOVU	(VEC_SIZE * 2)(%rsi, %rdi), %VMM(3)
	vpxorq	(VEC_SIZE * 2)(%rdi), %VMM(3), %VMM(3)

	VMOVU	(VEC_SIZE * 3)(%rsi, %rdi), %VMM(4)
	vpxorq	(VEC_SIZE * 3)(%rdi), %VMM(4), %VMM(4)

	vpternlogd $0xfe, %VMM(2), %VMM(3), %VMM(4)
	VPTEST	%VMM(4), %VMM(4), %k1
	KMOV	%k1, %VRAX
	TEST_ZERO (rax)
	jnz	L(return_neq2)
	subq	$-(VEC_SIZE * 4), %rdi
	cmpq	%rdx, %rdi
	jb	L(loop_4x_vec)

	subq	%rdx, %rdi

	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %VMM(4)
	vpxorq	(VEC_SIZE * 3)(%rdx), %VMM(4), %VMM(4)
	/* rdi has 4 * VEC_SIZE - remaining length.  */

	/* Load regardless of branch.  */
	VMOVU	(VEC_SIZE * 2)(%rsi, %rdx), %VMM(3)
	/* Ternary logic to xor (VEC_SIZE * 2)(%rdx) with VEC(3) while
	   oring with VEC(4).  Result is stored in VEC(4).  */
	vpternlogd $0xf6, (VEC_SIZE * 2)(%rdx), %VMM(3), %VMM(4)

	/* Separate logic, as we can only use testb for VEC_SIZE == 64.
	 */
# if VEC_SIZE == 64
	testb	%dil, %dil
	js	L(8x_last_2x_vec)
# else
	cmpl	$(VEC_SIZE * 2), %edi
	jge	L(8x_last_2x_vec)
# endif
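	/* Why testb works only there: after the subtraction above, %rdi
	   is in [0, 4 * VEC_SIZE), so for VEC_SIZE == 64 it fits in %dil
	   and its sign bit is set exactly when %rdi >= 128 = 2 * VEC_SIZE,
	   which is the same condition the cmpl/jge branch checks.  */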

	VMOVU	VEC_SIZE(%rsi, %rdx), %VMM(2)
	vpxorq	VEC_SIZE(%rdx), %VMM(2), %VMM(2)

	VMOVU	(%rsi, %rdx), %VMM(1)
	vpxorq	(%rdx), %VMM(1), %VMM(1)

	vpternlogd $0xfe, %VMM(1), %VMM(2), %VMM(4)
L(8x_last_1x_vec):
L(8x_last_2x_vec):
	VPTEST	%VMM(4), %VMM(4), %k1
	KMOV	%k1, %VRAX
	TO_32BIT_P1 (rax)
L(return_neq2):
	TO_32BIT_P2 (rax)
	ret

	.p2align 4,, 4
L(last_2x_vec):
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VMM(1)
	vpxorq	-(VEC_SIZE * 2)(%rdi, %rdx), %VMM(1), %VMM(1)
	VMOVU	-(VEC_SIZE * 1)(%rsi, %rdx), %VMM(2)
	vpternlogd $0xde, -(VEC_SIZE * 1)(%rdi, %rdx), %VMM(1), %VMM(2)
	VPTEST	%VMM(2), %VMM(2), %k1
	KMOV	%k1, %VRAX
	TO_32BIT (VRAX)
	ret

	/* evex256: 1 byte from the next cache line.  evex512: 15 bytes
	   from the next cache line.  */
END (MEMCMPEQ)
#endif