/* __memcmpeq optimized with EVEX.
   Copyright (C) 2017-2023 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)

/* __memcmpeq is implemented as:
   1. Use ymm vector compares when possible.  The only case where
      vector compares are not possible is when size < VEC_SIZE
      and loading from either s1 or s2 would cause a page cross.
   2. Use xmm vector compare when size >= 8 bytes.
   3. Optimistically compare up to the first 4 * VEC_SIZE bytes one
      vector at a time, to check for early mismatches.  Only do this
      if it's guaranteed the work is not wasted.
   4. If size is 8 * VEC_SIZE or less, unroll the loop.
   5. Compare 4 * VEC_SIZE at a time with the aligned first memory
      area.
   6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
   7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
   8. Use 8 vector compares when size is 8 * VEC_SIZE or less.  */
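
/* As an illustrative sketch (not part of the original source), the
   size dispatch above amounts to:

       if (size <= VEC_SIZE)           masked 1-vector compare
       else if (size <= 2 * VEC_SIZE)  2 vector compares
       else if (size <= 4 * VEC_SIZE)  4 vector compares
       else if (size <= 8 * VEC_SIZE)  8 vector compares, branchless tail
       else                            aligned 4 * VEC_SIZE loop plus tail

   Unlike memcmp, __memcmpeq is only required to return zero when the
   buffers are equal and nonzero otherwise, so no mismatch position or
   sign ever needs to be computed.  */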

# include <sysdep.h>

# ifndef MEMCMPEQ
#  define MEMCMPEQ	__memcmpeq_evex
# endif

# ifndef VEC_SIZE
#  include "x86-evex512-vecs.h"
# endif
# include "reg-macros.h"


# if VEC_SIZE == 32

#  define TEST_ZERO_VCMP(reg)	inc %VGPR(reg)
#  define TEST_ZERO(reg)	test %VGPR(reg), %VGPR(reg)

#  define TO_32BIT_P1(reg)	/* Do nothing.  */
#  define TO_32BIT_P2(reg)	/* Do nothing.  */
#  define TO_32BIT(reg)		/* Do nothing.  */

#  define VEC_CMP	VPCMPEQ

# elif VEC_SIZE == 64

#  define TEST_ZERO_VCMP(reg)	TEST_ZERO(reg)
#  define TEST_ZERO(reg)	neg %VGPR(reg)


/* VEC_SIZE == 64 needs to reduce the 64-bit mask to a 32-bit
   int.  We have two methods for this.  If the mask was branched
   on, we use `neg` for the branch and then `sbb` to get the
   32-bit return.  If the mask was not branched on, we just use
   `popcntq`.  */
#  define TO_32BIT_P1(reg)	TEST_ZERO(reg)
#  define TO_32BIT_P2(reg)	sbb %VGPR_SZ(reg, 32), %VGPR_SZ(reg, 32)
#  define TO_32BIT(reg)		popcntq %reg, %reg
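
/* Worked example of the two reductions: `neg %reg` sets CF iff the
   64-bit mask was nonzero, so a following `sbb %reg32, %reg32`
   computes reg32 - reg32 - CF, i.e. -1 on any mismatch and 0 when the
   buffers are equal.  `popcntq` instead counts the mismatching bytes;
   the count of a 64-bit mask always fits in 32 bits and is nonzero
   iff some byte differed.  Either result satisfies __memcmpeq's
   zero/nonzero return contract.  */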

#  define VEC_CMP	VPCMPNEQ

# else
#  error "Unsupported VEC_SIZE"
# endif


# define VMOVU_MASK	vmovdqu8
# define VPCMPNEQ	vpcmpneqb
# define VPCMPEQ	vpcmpeqb
# define VPTEST		vptestmb

# define PAGE_SIZE	4096

	.section SECTION(.text), "ax", @progbits
ENTRY_P2ALIGN (MEMCMPEQ, 6)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
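	/* Under the x32 ABI the 32-bit size_t argument may arrive with
	   garbage in bits 63:32 of rdx; a 32-bit register write
	   zero-extends, so the `movl` above truncates it safely.  */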
	cmp	$VEC_SIZE, %RDX_LP
	/* Fall through for [0, VEC_SIZE] as it's the hottest path.  */
	ja	L(more_1x_vec)

	/* Create mask of bytes that are guaranteed to be valid because
	   of length (edx).  Using masked movs allows us to skip checks
	   for page crosses/zero size.  */
	mov	$-1, %VRAX
	bzhi	%VRDX, %VRAX, %VRAX
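	/* `bzhi` keeps only the low VRDX bits of VRAX: for length 5 the
	   k-mask below becomes 0x1f, so exactly 5 bytes participate in
	   the masked load and compare; length 0 yields an empty mask
	   and hence a zero (equal) return.  */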
	/* NB: A `jz` might be useful here.  Page faults that are
	   suppressed by predicated execution (the evex mask) can be
	   very slow.  The expectation is that this is not the norm:
	   "most" code will not regularly call 'memcmp' with length = 0
	   and memory that is not wired up.  */
	KMOV	%VRAX, %k2

	/* Use masked loads, as VEC_SIZE could page cross where length
	   (edx) would not.  */
	VMOVU_MASK (%rsi), %VMM(2){%k2}{z}
	VPCMPNEQ (%rdi), %VMM(2), %k1{%k2}
	KMOV	%k1, %VRAX
	TO_32BIT (VRAX)
	ret

	.p2align 4,, 3
L(last_1x_vec):
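	/* The final VEC_SIZE bytes are reloaded relative to the end of
	   the buffers (base + rdx - VEC_SIZE), so they may overlap
	   bytes the first vector already compared; the redundant
	   compare is cheaper than computing the exact remainder.  */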
	VMOVU	-(VEC_SIZE * 1)(%rsi, %rdx), %VMM(1)
	VPCMPNEQ -(VEC_SIZE * 1)(%rdi, %rdx), %VMM(1), %k1
	KMOV	%k1, %VRAX
	TO_32BIT_P1 (rax)
L(return_neq0):
	TO_32BIT_P2 (rax)
	ret


	.p2align 4,, 12
L(more_1x_vec):
	/* From VEC + 1 to 2 * VEC.  */
	VMOVU	(%rsi), %VMM(1)
	/* Use compare not equals to directly check for mismatch.  */
	VPCMPNEQ (%rdi), %VMM(1), %k1
	KMOV	%k1, %VRAX
	TEST_ZERO (rax)
	jnz	L(return_neq0)

	cmpq	$(VEC_SIZE * 2), %rdx
	jbe	L(last_1x_vec)

	/* Check second VEC no matter what.  */
	VMOVU	VEC_SIZE(%rsi), %VMM(2)
	VPCMPNEQ VEC_SIZE(%rdi), %VMM(2), %k1
	KMOV	%k1, %VRAX
	TEST_ZERO (rax)
	jnz	L(return_neq0)

	/* Less than 4 * VEC.  */
	cmpq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_2x_vec)

	/* Check third and fourth VEC no matter what.  */
	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(3)
	VEC_CMP	(VEC_SIZE * 2)(%rdi), %VMM(3), %k1
	KMOV	%k1, %VRAX
	TEST_ZERO_VCMP (rax)
	jnz	L(return_neq0)

	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(4)
	VEC_CMP	(VEC_SIZE * 3)(%rdi), %VMM(4), %k1
	KMOV	%k1, %VRAX
	TEST_ZERO_VCMP (rax)
	jnz	L(return_neq0)

	/* Go to 4x VEC loop.  */
	cmpq	$(VEC_SIZE * 8), %rdx
	ja	L(more_8x_vec)

	/* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
	   branches.  */

	VMOVU	-(VEC_SIZE * 1)(%rsi, %rdx), %VMM(1)
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VMM(2)
	addq	%rdx, %rdi

	/* Wait to load from s1 until after the address adjustment, so
	   the loads use single-register addressing and avoid micro-op
	   unlamination.  */

	/* vpxor will be all 0s if s1 and s2 are equal.  Otherwise it
	   will have some 1s.  */
	vpxorq	-(VEC_SIZE * 1)(%rdi), %VMM(1), %VMM(1)
	/* Ternary logic to xor -(VEC_SIZE * 2)(%rdi) with VEC(2) while
	   oring with VEC(1).  Result is stored in VEC(2).  */
	vpternlogd $0xde, -(VEC_SIZE * 2)(%rdi), %VMM(1), %VMM(2)
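	/* vpternlogd builds each result bit from imm8, indexed by the
	   corresponding bits of (dest, first source, second source or
	   memory).  imm8 0xde is the truth table for B | (A ^ C), so
	   the instruction above computes VEC(1) | (VEC(2) ^ mem) into
	   VEC(2), folding this vector's difference into the running
	   accumulator with one instruction instead of a vpxorq/vporq
	   pair.  */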

	cmpl	$(VEC_SIZE * 6), %edx
	jbe	L(4x_last_2x_vec)

	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VMM(3)
	vpxorq	-(VEC_SIZE * 3)(%rdi), %VMM(3), %VMM(3)

	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VMM(4)
	vpxorq	-(VEC_SIZE * 4)(%rdi), %VMM(4), %VMM(4)

	/* Or together VEC(4), VEC(3), and VEC(2) into VEC(2).  */
	vpternlogd $0xfe, %VMM(4), %VMM(3), %VMM(2)
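	/* imm8 0xfe is the truth table for A | B | C (set unless all
	   three inputs are 0), i.e. a three-way OR that carries any
	   mismatch bit from the tail vectors into VEC(2).  */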

	/* Compare VEC(2) with 0.  If any 1s, s1 and s2 don't match.  */
L(4x_last_2x_vec):
	VPTEST	%VMM(2), %VMM(2), %k1
	KMOV	%k1, %VRAX
	TO_32BIT (VRAX)
	ret


	.p2align 4,, 10
L(more_8x_vec):
	/* Set end of s1 in rdx.  */
	leaq	-(VEC_SIZE * 4)(%rdi, %rdx), %rdx
	/* rsi stores s2 - s1.  This allows the loop to only update one
	   pointer.  */
	subq	%rdi, %rsi
	/* Align s1 pointer.  */
	andq	$-VEC_SIZE, %rdi
	/* Adjust because the first 4x VEC were already checked.  */
	subq	$-(VEC_SIZE * 4), %rdi
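	/* With rsi = s2 - s1, every s2 load in the loop is addressed as
	   (%rsi, %rdi), so each iteration advances a single pointer.
	   `subq $-(VEC_SIZE * 4)` is used instead of
	   `addq $(VEC_SIZE * 4)` because for VEC_SIZE == 32 the
	   constant -128 fits in a sign-extended 8-bit immediate while
	   +128 does not, giving a shorter encoding.  */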
	.p2align 5,, 12
	.p2align 4,, 8
L(loop_4x_vec):
	VMOVU	(%rsi, %rdi), %VMM(1)
	vpxorq	(%rdi), %VMM(1), %VMM(1)

	VMOVU	VEC_SIZE(%rsi, %rdi), %VMM(2)
	vpternlogd $0xde, (VEC_SIZE)(%rdi), %VMM(1), %VMM(2)

	VMOVU	(VEC_SIZE * 2)(%rsi, %rdi), %VMM(3)
	vpxorq	(VEC_SIZE * 2)(%rdi), %VMM(3), %VMM(3)

	VMOVU	(VEC_SIZE * 3)(%rsi, %rdi), %VMM(4)
	vpxorq	(VEC_SIZE * 3)(%rdi), %VMM(4), %VMM(4)

	vpternlogd $0xfe, %VMM(2), %VMM(3), %VMM(4)
	VPTEST	%VMM(4), %VMM(4), %k1
	KMOV	%k1, %VRAX
	TEST_ZERO (rax)
	jnz	L(return_neq2)
	subq	$-(VEC_SIZE * 4), %rdi
	cmpq	%rdx, %rdi
	jb	L(loop_4x_vec)

	subq	%rdx, %rdi
	/* rdi has 4 * VEC_SIZE - remaining length.  */

	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %VMM(4)
	vpxorq	(VEC_SIZE * 3)(%rdx), %VMM(4), %VMM(4)

	/* Load regardless of branch.  */
	VMOVU	(VEC_SIZE * 2)(%rsi, %rdx), %VMM(3)
	/* Ternary logic to xor (VEC_SIZE * 2)(%rdx) with VEC(3) while
	   oring with VEC(4).  Result is stored in VEC(4).  */
	vpternlogd $0xf6, (VEC_SIZE * 2)(%rdx), %VMM(3), %VMM(4)
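	/* imm8 0xf6 is the truth table for A | (B ^ C), with A the
	   destination VEC(4), matching the comment above: the third
	   tail vector's xor-difference is merged into the accumulator
	   in one instruction.  */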

	/* Separate logic, as we can only use testb for VEC_SIZE == 64.
	 */
# if VEC_SIZE == 64
	testb	%dil, %dil
	js	L(8x_last_2x_vec)
# else
	cmpl	$(VEC_SIZE * 2), %edi
	jge	L(8x_last_2x_vec)
# endif
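	/* Here rdi = 4 * VEC_SIZE - remaining, which lies in
	   [0, 4 * VEC_SIZE).  For VEC_SIZE == 64 that range is
	   [0, 256), so the sign bit of %dil is set exactly when
	   rdi >= 128 = 2 * VEC_SIZE; `testb`/`js` is then a shorter
	   encoding of the `cmpl`/`jge` pair used for VEC_SIZE == 32.  */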

	VMOVU	VEC_SIZE(%rsi, %rdx), %VMM(2)
	vpxorq	VEC_SIZE(%rdx), %VMM(2), %VMM(2)

	VMOVU	(%rsi, %rdx), %VMM(1)
	vpxorq	(%rdx), %VMM(1), %VMM(1)

	vpternlogd $0xfe, %VMM(1), %VMM(2), %VMM(4)
L(8x_last_1x_vec):
L(8x_last_2x_vec):
	VPTEST	%VMM(4), %VMM(4), %k1
	KMOV	%k1, %VRAX
	TO_32BIT_P1 (rax)
L(return_neq2):
	TO_32BIT_P2 (rax)
	ret

	.p2align 4,, 4
L(last_2x_vec):
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VMM(1)
	vpxorq	-(VEC_SIZE * 2)(%rdi, %rdx), %VMM(1), %VMM(1)
	VMOVU	-(VEC_SIZE * 1)(%rsi, %rdx), %VMM(2)
	vpternlogd $0xde, -(VEC_SIZE * 1)(%rdi, %rdx), %VMM(1), %VMM(2)
	VPTEST	%VMM(2), %VMM(2), %k1
	KMOV	%k1, %VRAX
	TO_32BIT (VRAX)
	ret

	/* evex256: 1 byte from next cache line.  evex512: 15 bytes from
	   next cache line.  */
END (MEMCMPEQ)
#endif