/* memmove/memcpy/mempcpy optimized with AVX512 for KNL hardware.
   Copyright (C) 2016-2019 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>

#if IS_IN (libc)

# include "asm-syntax.h"

	.section .text.avx512,"ax",@progbits
26 | ENTRY (__mempcpy_chk_avx512_no_vzeroupper) |
27 | cmpq %rdx, %rcx | |
28 | jb HIDDEN_JUMPTARGET (__chk_fail) | |
29 | END (__mempcpy_chk_avx512_no_vzeroupper) | |
/* void *mempcpy (void *dst, const void *src, size_t n)
   Differs from memmove only in the return value (dst + n), so compute
   that in rax and share the copy body via L(start).  */
ENTRY (__mempcpy_avx512_no_vzeroupper)
	movq	%rdi, %rax
	addq	%rdx, %rax		/* Return dst + n.  */
	jmp	L(start)		/* Tail of __memmove_avx512_no_vzeroupper.  */
END (__mempcpy_avx512_no_vzeroupper)
/* void *__memmove_chk (void *dst, const void *src, size_t n, size_t dstlen)
   Fortified entry: rcx = destination buffer size.  Aborts via
   __chk_fail on overflow; otherwise falls straight through (no ret)
   into __memmove_avx512_no_vzeroupper below.  */
ENTRY (__memmove_chk_avx512_no_vzeroupper)
	cmpq	%rdx, %rcx		/* dstlen < n?  (unsigned compare) */
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__memmove_chk_avx512_no_vzeroupper)
/* void *memmove (void *dst, const void *src, size_t n)
   SysV AMD64: rdi = dst, rsi = src, rdx = n.  Returns dst in rax
   (dst + n when built with USE_AS_MEMPCPY).  Tuned for Knights
   Landing, where VZEROUPPER is expensive -- hence this
   "_no_vzeroupper" variant never emits it.  */
ENTRY (__memmove_avx512_no_vzeroupper)
	mov	%rdi, %rax		/* Return value.  */
# ifdef USE_AS_MEMPCPY
	add	%rdx, %rax		/* mempcpy returns dst + n.  */
# endif
L(start):
	/* rcx = src + n, r9 = dst + n.  All tail accesses below use
	   negative offsets from these two end pointers.  */
	lea	(%rsi, %rdx), %rcx
	lea	(%rdi, %rdx), %r9
	cmp	$512, %rdx
	ja	L(512bytesormore)

L(check):
	/* n <= 512 here.  Also the finish-up path of the non-temporal
	   large copies, entered with the uncopied remainder described by
	   rsi/rdi (starts), rcx/r9 (ends) and rdx (count).  Every size
	   class loads both the head and the tail of the range before
	   storing anything, so overlapping src/dst is handled correctly
	   in either direction without a separate backward path.  */
	cmp	$16, %rdx
	jbe	L(less_16bytes)
	cmp	$256, %rdx
	jb	L(less_256bytes)
	/* 256 < n <= 512: four zmm from each end (middles may overlap;
	   all loads precede all stores).  */
	vmovups	(%rsi), %zmm0
	vmovups	0x40(%rsi), %zmm1
	vmovups	0x80(%rsi), %zmm2
	vmovups	0xC0(%rsi), %zmm3
	vmovups	-0x100(%rcx), %zmm4
	vmovups	-0xC0(%rcx), %zmm5
	vmovups	-0x80(%rcx), %zmm6
	vmovups	-0x40(%rcx), %zmm7
	vmovups	%zmm0, (%rdi)
	vmovups	%zmm1, 0x40(%rdi)
	vmovups	%zmm2, 0x80(%rdi)
	vmovups	%zmm3, 0xC0(%rdi)
	vmovups	%zmm4, -0x100(%r9)
	vmovups	%zmm5, -0xC0(%r9)
	vmovups	%zmm6, -0x80(%r9)
	vmovups	%zmm7, -0x40(%r9)
	ret

L(less_256bytes):
	/* n < 256, so its low byte (%dl) holds the whole count.  */
	cmp	$128, %dl
	jb	L(less_128bytes)
	/* 128 <= n < 256: two zmm from each end.  */
	vmovups	(%rsi), %zmm0
	vmovups	0x40(%rsi), %zmm1
	vmovups	-0x80(%rcx), %zmm2
	vmovups	-0x40(%rcx), %zmm3
	vmovups	%zmm0, (%rdi)
	vmovups	%zmm1, 0x40(%rdi)
	vmovups	%zmm2, -0x80(%r9)
	vmovups	%zmm3, -0x40(%r9)
	ret

L(less_128bytes):
	cmp	$64, %dl
	jb	L(less_64bytes)
	/* 64 <= n < 128: two ymm from each end.  */
	vmovdqu	(%rsi), %ymm0
	vmovdqu	0x20(%rsi), %ymm1
	vmovdqu	-0x40(%rcx), %ymm2
	vmovdqu	-0x20(%rcx), %ymm3
	vmovdqu	%ymm0, (%rdi)
	vmovdqu	%ymm1, 0x20(%rdi)
	vmovdqu	%ymm2, -0x40(%r9)
	vmovdqu	%ymm3, -0x20(%r9)
	ret

L(less_64bytes):
	cmp	$32, %dl
	jb	L(less_32bytes)
	/* 32 <= n < 64: one ymm from each end.  */
	vmovdqu	(%rsi), %ymm0
	vmovdqu	-0x20(%rcx), %ymm1
	vmovdqu	%ymm0, (%rdi)
	vmovdqu	%ymm1, -0x20(%r9)
	ret

L(less_32bytes):
	/* 16 < n < 32: one xmm from each end.  */
	vmovdqu	(%rsi), %xmm0
	vmovdqu	-0x10(%rcx), %xmm1
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm1, -0x10(%r9)
	ret

L(less_16bytes):
	cmp	$8, %dl
	jb	L(less_8bytes)
	/* 8 <= n <= 16: two possibly-overlapping 8-byte moves.  rsi/rcx
	   are dead after their loads, so reuse them as scratch.  */
	movq	(%rsi), %rsi
	movq	-0x8(%rcx), %rcx
	movq	%rsi, (%rdi)
	movq	%rcx, -0x8(%r9)
	ret

L(less_8bytes):
	cmp	$4, %dl
	jb	L(less_4bytes)
	/* 4 <= n < 8: two possibly-overlapping 4-byte moves.  */
	mov	(%rsi), %esi
	mov	-0x4(%rcx), %ecx
	mov	%esi, (%rdi)
	mov	%ecx, -0x4(%r9)
	ret

L(less_4bytes):
	cmp	$2, %dl
	jb	L(less_2bytes)
	/* 2 <= n < 4: two possibly-overlapping 2-byte moves.  */
	mov	(%rsi), %si
	mov	-0x2(%rcx), %cx
	mov	%si, (%rdi)
	mov	%cx, -0x2(%r9)
	ret

L(less_2bytes):
	cmp	$1, %dl
	jb	L(less_1bytes)
	/* n == 1: single byte.  */
	mov	(%rsi), %cl
	mov	%cl, (%rdi)
L(less_1bytes):
	ret

L(512bytesormore):
	/* r8 = half the shared cache size; at or above it, bypass the
	   cache with non-temporal stores instead of polluting it.  */
# ifdef SHARED_CACHE_SIZE_HALF
	mov	$SHARED_CACHE_SIZE_HALF, %r8
# else
	mov	__x86_shared_cache_size_half(%rip), %r8
# endif
	cmp	%r8, %rdx
	jae	L(preloop_large)
	cmp	$1024, %rdx
	ja	L(1024bytesormore)
	/* 512 < n <= 1024: prefetch both ends, then copy 512 bytes from
	   each end (the halves may overlap in the middle; all 16 loads
	   precede all 16 stores, so overlap is safe).  */
	prefetcht1 (%rsi)
	prefetcht1 0x40(%rsi)
	prefetcht1 0x80(%rsi)
	prefetcht1 0xC0(%rsi)
	prefetcht1 0x100(%rsi)
	prefetcht1 0x140(%rsi)
	prefetcht1 0x180(%rsi)
	prefetcht1 0x1C0(%rsi)
	prefetcht1 -0x200(%rcx)
	prefetcht1 -0x1C0(%rcx)
	prefetcht1 -0x180(%rcx)
	prefetcht1 -0x140(%rcx)
	prefetcht1 -0x100(%rcx)
	prefetcht1 -0xC0(%rcx)
	prefetcht1 -0x80(%rcx)
	prefetcht1 -0x40(%rcx)
	vmovups	(%rsi), %zmm0
	vmovups	0x40(%rsi), %zmm1
	vmovups	0x80(%rsi), %zmm2
	vmovups	0xC0(%rsi), %zmm3
	vmovups	0x100(%rsi), %zmm4
	vmovups	0x140(%rsi), %zmm5
	vmovups	0x180(%rsi), %zmm6
	vmovups	0x1C0(%rsi), %zmm7
	vmovups	-0x200(%rcx), %zmm8
	vmovups	-0x1C0(%rcx), %zmm9
	vmovups	-0x180(%rcx), %zmm10
	vmovups	-0x140(%rcx), %zmm11
	vmovups	-0x100(%rcx), %zmm12
	vmovups	-0xC0(%rcx), %zmm13
	vmovups	-0x80(%rcx), %zmm14
	vmovups	-0x40(%rcx), %zmm15
	vmovups	%zmm0, (%rdi)
	vmovups	%zmm1, 0x40(%rdi)
	vmovups	%zmm2, 0x80(%rdi)
	vmovups	%zmm3, 0xC0(%rdi)
	vmovups	%zmm4, 0x100(%rdi)
	vmovups	%zmm5, 0x140(%rdi)
	vmovups	%zmm6, 0x180(%rdi)
	vmovups	%zmm7, 0x1C0(%rdi)
	vmovups	%zmm8, -0x200(%r9)
	vmovups	%zmm9, -0x1C0(%r9)
	vmovups	%zmm10, -0x180(%r9)
	vmovups	%zmm11, -0x140(%r9)
	vmovups	%zmm12, -0x100(%r9)
	vmovups	%zmm13, -0xC0(%r9)
	vmovups	%zmm14, -0x80(%r9)
	vmovups	%zmm15, -0x40(%r9)
	ret

L(1024bytesormore):
	/* n > 1024, below the NT threshold.  Copy forward unless dst is
	   above src (overlap hazard for a forward copy).  */
	cmp	%rsi, %rdi
	ja	L(1024bytesormore_bkw)
	/* Forward: r9 = dst + n - 512 is the loop bound.  Preload the
	   final 512 source bytes into zmm8-15 NOW, before the loop can
	   overwrite them through an overlapping destination.  */
	sub	$512, %r9
	vmovups	-0x200(%rcx), %zmm8
	vmovups	-0x1C0(%rcx), %zmm9
	vmovups	-0x180(%rcx), %zmm10
	vmovups	-0x140(%rcx), %zmm11
	vmovups	-0x100(%rcx), %zmm12
	vmovups	-0xC0(%rcx), %zmm13
	vmovups	-0x80(%rcx), %zmm14
	vmovups	-0x40(%rcx), %zmm15
	prefetcht1 (%rsi)
	prefetcht1 0x40(%rsi)
	prefetcht1 0x80(%rsi)
	prefetcht1 0xC0(%rsi)
	prefetcht1 0x100(%rsi)
	prefetcht1 0x140(%rsi)
	prefetcht1 0x180(%rsi)
	prefetcht1 0x1C0(%rsi)

/* Loop with unaligned memory access; 512 bytes per iteration, with the
   next iteration's source chunk prefetched into L2 mid-iteration.  */
L(gobble_512bytes_loop):
	vmovups	(%rsi), %zmm0
	vmovups	0x40(%rsi), %zmm1
	vmovups	0x80(%rsi), %zmm2
	vmovups	0xC0(%rsi), %zmm3
	vmovups	0x100(%rsi), %zmm4
	vmovups	0x140(%rsi), %zmm5
	vmovups	0x180(%rsi), %zmm6
	vmovups	0x1C0(%rsi), %zmm7
	add	$512, %rsi
	prefetcht1 (%rsi)
	prefetcht1 0x40(%rsi)
	prefetcht1 0x80(%rsi)
	prefetcht1 0xC0(%rsi)
	prefetcht1 0x100(%rsi)
	prefetcht1 0x140(%rsi)
	prefetcht1 0x180(%rsi)
	prefetcht1 0x1C0(%rsi)
	vmovups	%zmm0, (%rdi)
	vmovups	%zmm1, 0x40(%rdi)
	vmovups	%zmm2, 0x80(%rdi)
	vmovups	%zmm3, 0xC0(%rdi)
	vmovups	%zmm4, 0x100(%rdi)
	vmovups	%zmm5, 0x140(%rdi)
	vmovups	%zmm6, 0x180(%rdi)
	vmovups	%zmm7, 0x1C0(%rdi)
	add	$512, %rdi
	cmp	%r9, %rdi
	jb	L(gobble_512bytes_loop)
	/* Write the preloaded tail into the last (possibly overlapping
	   the loop's output) 512-byte window ending at dst + n.  */
	vmovups	%zmm8, (%r9)
	vmovups	%zmm9, 0x40(%r9)
	vmovups	%zmm10, 0x80(%r9)
	vmovups	%zmm11, 0xC0(%r9)
	vmovups	%zmm12, 0x100(%r9)
	vmovups	%zmm13, 0x140(%r9)
	vmovups	%zmm14, 0x180(%r9)
	vmovups	%zmm15, 0x1C0(%r9)
	ret

L(1024bytesormore_bkw):
	/* Backward (dst > src): rdi = dst + 512 is the loop bound.
	   Preload the FIRST 512 source bytes into zmm8-15 before the
	   backward loop can clobber them.  */
	add	$512, %rdi
	vmovups	0x1C0(%rsi), %zmm8
	vmovups	0x180(%rsi), %zmm9
	vmovups	0x140(%rsi), %zmm10
	vmovups	0x100(%rsi), %zmm11
	vmovups	0xC0(%rsi), %zmm12
	vmovups	0x80(%rsi), %zmm13
	vmovups	0x40(%rsi), %zmm14
	vmovups	(%rsi), %zmm15
	prefetcht1 -0x40(%rcx)
	prefetcht1 -0x80(%rcx)
	prefetcht1 -0xC0(%rcx)
	prefetcht1 -0x100(%rcx)
	prefetcht1 -0x140(%rcx)
	prefetcht1 -0x180(%rcx)
	prefetcht1 -0x1C0(%rcx)
	prefetcht1 -0x200(%rcx)

/* Backward loop with unaligned memory access; 512 bytes per iteration,
   walking rcx (src end) and r9 (dst end) downward.  */
L(gobble_512bytes_loop_bkw):
	vmovups	-0x40(%rcx), %zmm0
	vmovups	-0x80(%rcx), %zmm1
	vmovups	-0xC0(%rcx), %zmm2
	vmovups	-0x100(%rcx), %zmm3
	vmovups	-0x140(%rcx), %zmm4
	vmovups	-0x180(%rcx), %zmm5
	vmovups	-0x1C0(%rcx), %zmm6
	vmovups	-0x200(%rcx), %zmm7
	sub	$512, %rcx
	prefetcht1 -0x40(%rcx)
	prefetcht1 -0x80(%rcx)
	prefetcht1 -0xC0(%rcx)
	prefetcht1 -0x100(%rcx)
	prefetcht1 -0x140(%rcx)
	prefetcht1 -0x180(%rcx)
	prefetcht1 -0x1C0(%rcx)
	prefetcht1 -0x200(%rcx)
	vmovups	%zmm0, -0x40(%r9)
	vmovups	%zmm1, -0x80(%r9)
	vmovups	%zmm2, -0xC0(%r9)
	vmovups	%zmm3, -0x100(%r9)
	vmovups	%zmm4, -0x140(%r9)
	vmovups	%zmm5, -0x180(%r9)
	vmovups	%zmm6, -0x1C0(%r9)
	vmovups	%zmm7, -0x200(%r9)
	sub	$512, %r9
	cmp	%rdi, %r9
	ja	L(gobble_512bytes_loop_bkw)
	/* Write the preloaded head into the first 512 bytes of dst.  */
	vmovups	%zmm8, -0x40(%rdi)
	vmovups	%zmm9, -0x80(%rdi)
	vmovups	%zmm10, -0xC0(%rdi)
	vmovups	%zmm11, -0x100(%rdi)
	vmovups	%zmm12, -0x140(%rdi)
	vmovups	%zmm13, -0x180(%rdi)
	vmovups	%zmm14, -0x1C0(%rdi)
	vmovups	%zmm15, -0x200(%rdi)
	ret

L(preloop_large):
	/* n >= half the shared cache: stream with non-temporal stores.
	   Forward unless dst is above src.  */
	cmp	%rsi, %rdi
	ja	L(preloop_large_bkw)
	/* Save the first 128 source bytes: the alignment step below may
	   skip up to 128 bytes at the start of dst, which are written
	   last from zmm4/zmm5 (after the sfence).  */
	vmovups	(%rsi), %zmm4
	vmovups	0x40(%rsi), %zmm5

	mov	%rdi, %r11		/* Original dst, for the final head store.  */
	/* Align destination for access with non-temporal stores in the loop:
	   round rdi down to 128 then add 128 (always advances, even if
	   already aligned); r8 = old dst - new dst (<= 0), used to
	   advance rsi and shrink rdx by the skipped amount.  */
	mov	%rdi, %r8
	and	$-0x80, %rdi
	add	$0x80, %rdi
	sub	%rdi, %r8
	sub	%r8, %rsi
	add	%r8, %rdx
L(gobble_256bytes_nt_loop):
	/* 256 bytes per iteration: prefetch 512 bytes ahead, unaligned
	   zmm loads, 64-byte-aligned non-temporal stores.  */
	prefetcht1 0x200(%rsi)
	prefetcht1 0x240(%rsi)
	prefetcht1 0x280(%rsi)
	prefetcht1 0x2C0(%rsi)
	prefetcht1 0x300(%rsi)
	prefetcht1 0x340(%rsi)
	prefetcht1 0x380(%rsi)
	prefetcht1 0x3C0(%rsi)
	vmovdqu64 (%rsi), %zmm0
	vmovdqu64 0x40(%rsi), %zmm1
	vmovdqu64 0x80(%rsi), %zmm2
	vmovdqu64 0xC0(%rsi), %zmm3
	vmovntdq %zmm0, (%rdi)
	vmovntdq %zmm1, 0x40(%rdi)
	vmovntdq %zmm2, 0x80(%rdi)
	vmovntdq %zmm3, 0xC0(%rdi)
	sub	$256, %rdx
	add	$256, %rsi
	add	$256, %rdi
	cmp	$256, %rdx
	ja	L(gobble_256bytes_nt_loop)
	sfence				/* Order the NT stores before the normal stores below.  */
	vmovups	%zmm4, (%r11)		/* Restore the saved first 128 bytes.  */
	vmovups	%zmm5, 0x40(%r11)
	jmp	L(check)		/* Copy the remaining n <= 256 bytes.  */

L(preloop_large_bkw):
	/* Backward non-temporal copy.  Save the LAST 128 source bytes:
	   the end-alignment step trims up to 127 bytes at the tail of
	   dst, restored from zmm4/zmm5 after the loop.  */
	vmovups	-0x80(%rcx), %zmm4
	vmovups	-0x40(%rcx), %zmm5

	/* Align end of destination for access with non-temporal stores:
	   round r9 down to a 128-byte boundary; r8 first holds the
	   trimmed byte count (used to shrink rcx and rdx), then is
	   repointed at the ORIGINAL dst end for the final tail store.  */
	mov	%r9, %r8
	and	$-0x80, %r9
	sub	%r9, %r8
	sub	%r8, %rcx
	sub	%r8, %rdx
	add	%r9, %r8
L(gobble_256bytes_nt_loop_bkw):
	/* 256 bytes per iteration, walking down from the aligned ends.  */
	prefetcht1 -0x400(%rcx)
	prefetcht1 -0x3C0(%rcx)
	prefetcht1 -0x380(%rcx)
	prefetcht1 -0x340(%rcx)
	prefetcht1 -0x300(%rcx)
	prefetcht1 -0x2C0(%rcx)
	prefetcht1 -0x280(%rcx)
	prefetcht1 -0x240(%rcx)
	vmovdqu64 -0x100(%rcx), %zmm0
	vmovdqu64 -0xC0(%rcx), %zmm1
	vmovdqu64 -0x80(%rcx), %zmm2
	vmovdqu64 -0x40(%rcx), %zmm3
	vmovntdq %zmm0, -0x100(%r9)
	vmovntdq %zmm1, -0xC0(%r9)
	vmovntdq %zmm2, -0x80(%r9)
	vmovntdq %zmm3, -0x40(%r9)
	sub	$256, %rdx
	sub	$256, %rcx
	sub	$256, %r9
	cmp	$256, %rdx
	ja	L(gobble_256bytes_nt_loop_bkw)
	sfence				/* Order the NT stores before the normal stores below.  */
	vmovups	%zmm4, -0x80(%r8)	/* Restore the saved final 128 bytes.  */
	vmovups	%zmm5, -0x40(%r8)
	jmp	L(check)		/* Copy the remaining n <= 256 bytes.  */
END (__memmove_avx512_no_vzeroupper)

strong_alias (__memmove_avx512_no_vzeroupper, __memcpy_avx512_no_vzeroupper)
strong_alias (__memmove_chk_avx512_no_vzeroupper, __memcpy_chk_avx512_no_vzeroupper)
#endif