/* memmove/memcpy/mempcpy optimized with AVX512 for KNL hardware.
   Copyright (C) 2016-2019 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>

#if IS_IN (libc)

# include "asm-syntax.h"

	.section .text.avx512,"ax",@progbits
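/* The *_chk entries verify that the destination buffer (size in %rcx) can
   hold %rdx bytes before falling through to the unchecked entry points
   below.  */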
ENTRY (__mempcpy_chk_avx512_no_vzeroupper)
	cmpq	%rdx, %rcx
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__mempcpy_chk_avx512_no_vzeroupper)

ENTRY (__mempcpy_avx512_no_vzeroupper)
	movq	%rdi, %rax
	addq	%rdx, %rax
	jmp	L(start)
END (__mempcpy_avx512_no_vzeroupper)

ENTRY (__memmove_chk_avx512_no_vzeroupper)
	cmpq	%rdx, %rcx
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__memmove_chk_avx512_no_vzeroupper)

ENTRY (__memmove_avx512_no_vzeroupper)
	mov	%rdi, %rax
# ifdef USE_AS_MEMPCPY
	add	%rdx, %rax
# endif
L(start):
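	/* %rcx and %r9 point one byte past the end of the source and of
	   the destination, respectively.  */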
	lea	(%rsi, %rdx), %rcx
	lea	(%rdi, %rdx), %r9
	cmp	$512, %rdx
	ja	L(512bytesormore)

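/* Copies of at most 512 bytes: everything is loaded from both ends of the
   source before any store, so this path is safe for overlapping buffers.
   It is also used to finish the remainder left over by the non-temporal
   copies below.  */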
L(check):
	cmp	$16, %rdx
	jbe	L(less_16bytes)
	cmp	$256, %rdx
	jb	L(less_256bytes)
	vmovups	(%rsi), %zmm0
	vmovups	0x40(%rsi), %zmm1
	vmovups	0x80(%rsi), %zmm2
	vmovups	0xC0(%rsi), %zmm3
	vmovups	-0x100(%rcx), %zmm4
	vmovups	-0xC0(%rcx), %zmm5
	vmovups	-0x80(%rcx), %zmm6
	vmovups	-0x40(%rcx), %zmm7
	vmovups	%zmm0, (%rdi)
	vmovups	%zmm1, 0x40(%rdi)
	vmovups	%zmm2, 0x80(%rdi)
	vmovups	%zmm3, 0xC0(%rdi)
	vmovups	%zmm4, -0x100(%r9)
	vmovups	%zmm5, -0xC0(%r9)
	vmovups	%zmm6, -0x80(%r9)
	vmovups	%zmm7, -0x40(%r9)
	ret

L(less_256bytes):
	cmp	$128, %dl
	jb	L(less_128bytes)
	vmovups	(%rsi), %zmm0
	vmovups	0x40(%rsi), %zmm1
	vmovups	-0x80(%rcx), %zmm2
	vmovups	-0x40(%rcx), %zmm3
	vmovups	%zmm0, (%rdi)
	vmovups	%zmm1, 0x40(%rdi)
	vmovups	%zmm2, -0x80(%r9)
	vmovups	%zmm3, -0x40(%r9)
	ret

L(less_128bytes):
	cmp	$64, %dl
	jb	L(less_64bytes)
	vmovdqu	(%rsi), %ymm0
	vmovdqu	0x20(%rsi), %ymm1
	vmovdqu	-0x40(%rcx), %ymm2
	vmovdqu	-0x20(%rcx), %ymm3
	vmovdqu	%ymm0, (%rdi)
	vmovdqu	%ymm1, 0x20(%rdi)
	vmovdqu	%ymm2, -0x40(%r9)
	vmovdqu	%ymm3, -0x20(%r9)
	ret

L(less_64bytes):
	cmp	$32, %dl
	jb	L(less_32bytes)
	vmovdqu	(%rsi), %ymm0
	vmovdqu	-0x20(%rcx), %ymm1
	vmovdqu	%ymm0, (%rdi)
	vmovdqu	%ymm1, -0x20(%r9)
	ret

L(less_32bytes):
	vmovdqu	(%rsi), %xmm0
	vmovdqu	-0x10(%rcx), %xmm1
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm1, -0x10(%r9)
	ret

L(less_16bytes):
	cmp	$8, %dl
	jb	L(less_8bytes)
	movq	(%rsi), %rsi
	movq	-0x8(%rcx), %rcx
	movq	%rsi, (%rdi)
	movq	%rcx, -0x8(%r9)
	ret

L(less_8bytes):
	cmp	$4, %dl
	jb	L(less_4bytes)
	mov	(%rsi), %esi
	mov	-0x4(%rcx), %ecx
	mov	%esi, (%rdi)
	mov	%ecx, -0x4(%r9)
	ret

L(less_4bytes):
	cmp	$2, %dl
	jb	L(less_2bytes)
	mov	(%rsi), %si
	mov	-0x2(%rcx), %cx
	mov	%si, (%rdi)
	mov	%cx, -0x2(%r9)
	ret

L(less_2bytes):
	cmp	$1, %dl
	jb	L(less_1bytes)
	mov	(%rsi), %cl
	mov	%cl, (%rdi)
L(less_1bytes):
	ret

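/* More than 512 bytes: once the size reaches half of the shared cache the
   copy switches to non-temporal stores; otherwise it stays on the regular
   vmovups paths below.  */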
L(512bytesormore):
# ifdef SHARED_CACHE_SIZE_HALF
	mov	$SHARED_CACHE_SIZE_HALF, %r8
# else
	mov	__x86_shared_cache_size_half(%rip), %r8
# endif
	cmp	%r8, %rdx
	jae	L(preloop_large)
	cmp	$1024, %rdx
	ja	L(1024bytesormore)
	prefetcht1 (%rsi)
	prefetcht1 0x40(%rsi)
	prefetcht1 0x80(%rsi)
	prefetcht1 0xC0(%rsi)
	prefetcht1 0x100(%rsi)
	prefetcht1 0x140(%rsi)
	prefetcht1 0x180(%rsi)
	prefetcht1 0x1C0(%rsi)
	prefetcht1 -0x200(%rcx)
	prefetcht1 -0x1C0(%rcx)
	prefetcht1 -0x180(%rcx)
	prefetcht1 -0x140(%rcx)
	prefetcht1 -0x100(%rcx)
	prefetcht1 -0xC0(%rcx)
	prefetcht1 -0x80(%rcx)
	prefetcht1 -0x40(%rcx)
	vmovups	(%rsi), %zmm0
	vmovups	0x40(%rsi), %zmm1
	vmovups	0x80(%rsi), %zmm2
	vmovups	0xC0(%rsi), %zmm3
	vmovups	0x100(%rsi), %zmm4
	vmovups	0x140(%rsi), %zmm5
	vmovups	0x180(%rsi), %zmm6
	vmovups	0x1C0(%rsi), %zmm7
	vmovups	-0x200(%rcx), %zmm8
	vmovups	-0x1C0(%rcx), %zmm9
	vmovups	-0x180(%rcx), %zmm10
	vmovups	-0x140(%rcx), %zmm11
	vmovups	-0x100(%rcx), %zmm12
	vmovups	-0xC0(%rcx), %zmm13
	vmovups	-0x80(%rcx), %zmm14
	vmovups	-0x40(%rcx), %zmm15
	vmovups	%zmm0, (%rdi)
	vmovups	%zmm1, 0x40(%rdi)
	vmovups	%zmm2, 0x80(%rdi)
	vmovups	%zmm3, 0xC0(%rdi)
	vmovups	%zmm4, 0x100(%rdi)
	vmovups	%zmm5, 0x140(%rdi)
	vmovups	%zmm6, 0x180(%rdi)
	vmovups	%zmm7, 0x1C0(%rdi)
	vmovups	%zmm8, -0x200(%r9)
	vmovups	%zmm9, -0x1C0(%r9)
	vmovups	%zmm10, -0x180(%r9)
	vmovups	%zmm11, -0x140(%r9)
	vmovups	%zmm12, -0x100(%r9)
	vmovups	%zmm13, -0xC0(%r9)
	vmovups	%zmm14, -0x80(%r9)
	vmovups	%zmm15, -0x40(%r9)
	ret

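/* Between 1024 bytes and half the shared cache: copy forward in 512-byte
   chunks unless the destination starts above the source.  The last 512
   bytes are loaded into %zmm8-%zmm15 before the loop so an overlapping
   tail cannot be clobbered by the loop's stores.  */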
L(1024bytesormore):
	cmp	%rsi, %rdi
	ja	L(1024bytesormore_bkw)
	sub	$512, %r9
	vmovups	-0x200(%rcx), %zmm8
	vmovups	-0x1C0(%rcx), %zmm9
	vmovups	-0x180(%rcx), %zmm10
	vmovups	-0x140(%rcx), %zmm11
	vmovups	-0x100(%rcx), %zmm12
	vmovups	-0xC0(%rcx), %zmm13
	vmovups	-0x80(%rcx), %zmm14
	vmovups	-0x40(%rcx), %zmm15
	prefetcht1 (%rsi)
	prefetcht1 0x40(%rsi)
	prefetcht1 0x80(%rsi)
	prefetcht1 0xC0(%rsi)
	prefetcht1 0x100(%rsi)
	prefetcht1 0x140(%rsi)
	prefetcht1 0x180(%rsi)
	prefetcht1 0x1C0(%rsi)

/* Loop with unaligned memory access.  */
L(gobble_512bytes_loop):
	vmovups	(%rsi), %zmm0
	vmovups	0x40(%rsi), %zmm1
	vmovups	0x80(%rsi), %zmm2
	vmovups	0xC0(%rsi), %zmm3
	vmovups	0x100(%rsi), %zmm4
	vmovups	0x140(%rsi), %zmm5
	vmovups	0x180(%rsi), %zmm6
	vmovups	0x1C0(%rsi), %zmm7
	add	$512, %rsi
	prefetcht1 (%rsi)
	prefetcht1 0x40(%rsi)
	prefetcht1 0x80(%rsi)
	prefetcht1 0xC0(%rsi)
	prefetcht1 0x100(%rsi)
	prefetcht1 0x140(%rsi)
	prefetcht1 0x180(%rsi)
	prefetcht1 0x1C0(%rsi)
	vmovups	%zmm0, (%rdi)
	vmovups	%zmm1, 0x40(%rdi)
	vmovups	%zmm2, 0x80(%rdi)
	vmovups	%zmm3, 0xC0(%rdi)
	vmovups	%zmm4, 0x100(%rdi)
	vmovups	%zmm5, 0x140(%rdi)
	vmovups	%zmm6, 0x180(%rdi)
	vmovups	%zmm7, 0x1C0(%rdi)
	add	$512, %rdi
	cmp	%r9, %rdi
	jb	L(gobble_512bytes_loop)
	vmovups	%zmm8, (%r9)
	vmovups	%zmm9, 0x40(%r9)
	vmovups	%zmm10, 0x80(%r9)
	vmovups	%zmm11, 0xC0(%r9)
	vmovups	%zmm12, 0x100(%r9)
	vmovups	%zmm13, 0x140(%r9)
	vmovups	%zmm14, 0x180(%r9)
	vmovups	%zmm15, 0x1C0(%r9)
	ret

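/* Backward variant of the loop above, taken when the destination starts
   above the source.  The first 512 bytes of the source are loaded into
   %zmm8-%zmm15 up front and stored last, so an overlapping head cannot be
   clobbered.  */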
L(1024bytesormore_bkw):
	add	$512, %rdi
	vmovups	0x1C0(%rsi), %zmm8
	vmovups	0x180(%rsi), %zmm9
	vmovups	0x140(%rsi), %zmm10
	vmovups	0x100(%rsi), %zmm11
	vmovups	0xC0(%rsi), %zmm12
	vmovups	0x80(%rsi), %zmm13
	vmovups	0x40(%rsi), %zmm14
	vmovups	(%rsi), %zmm15
	prefetcht1 -0x40(%rcx)
	prefetcht1 -0x80(%rcx)
	prefetcht1 -0xC0(%rcx)
	prefetcht1 -0x100(%rcx)
	prefetcht1 -0x140(%rcx)
	prefetcht1 -0x180(%rcx)
	prefetcht1 -0x1C0(%rcx)
	prefetcht1 -0x200(%rcx)

/* Backward loop with unaligned memory access.  */
L(gobble_512bytes_loop_bkw):
	vmovups	-0x40(%rcx), %zmm0
	vmovups	-0x80(%rcx), %zmm1
	vmovups	-0xC0(%rcx), %zmm2
	vmovups	-0x100(%rcx), %zmm3
	vmovups	-0x140(%rcx), %zmm4
	vmovups	-0x180(%rcx), %zmm5
	vmovups	-0x1C0(%rcx), %zmm6
	vmovups	-0x200(%rcx), %zmm7
	sub	$512, %rcx
	prefetcht1 -0x40(%rcx)
	prefetcht1 -0x80(%rcx)
	prefetcht1 -0xC0(%rcx)
	prefetcht1 -0x100(%rcx)
	prefetcht1 -0x140(%rcx)
	prefetcht1 -0x180(%rcx)
	prefetcht1 -0x1C0(%rcx)
	prefetcht1 -0x200(%rcx)
	vmovups	%zmm0, -0x40(%r9)
	vmovups	%zmm1, -0x80(%r9)
	vmovups	%zmm2, -0xC0(%r9)
	vmovups	%zmm3, -0x100(%r9)
	vmovups	%zmm4, -0x140(%r9)
	vmovups	%zmm5, -0x180(%r9)
	vmovups	%zmm6, -0x1C0(%r9)
	vmovups	%zmm7, -0x200(%r9)
	sub	$512, %r9
	cmp	%rdi, %r9
	ja	L(gobble_512bytes_loop_bkw)
	vmovups	%zmm8, -0x40(%rdi)
	vmovups	%zmm9, -0x80(%rdi)
	vmovups	%zmm10, -0xC0(%rdi)
	vmovups	%zmm11, -0x100(%rdi)
	vmovups	%zmm12, -0x140(%rdi)
	vmovups	%zmm13, -0x180(%rdi)
	vmovups	%zmm14, -0x1C0(%rdi)
	vmovups	%zmm15, -0x200(%rdi)
	ret

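/* At least half the shared cache: stream 256-byte chunks with non-temporal
   stores (vmovntdq) to avoid displacing the cache.  The first 128 bytes
   are saved in %zmm4/%zmm5 and written back with regular stores at the
   end; the remaining tail of at most 256 bytes is finished by jumping back
   to L(check).  */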
L(preloop_large):
	cmp	%rsi, %rdi
	ja	L(preloop_large_bkw)
	vmovups	(%rsi), %zmm4
	vmovups	0x40(%rsi), %zmm5

	mov	%rdi, %r11
/* Align destination for access with non-temporal stores in the loop.  */
	mov	%rdi, %r8
	and	$-0x80, %rdi
	add	$0x80, %rdi
	sub	%rdi, %r8
	sub	%r8, %rsi
	add	%r8, %rdx
L(gobble_256bytes_nt_loop):
	prefetcht1 0x200(%rsi)
	prefetcht1 0x240(%rsi)
	prefetcht1 0x280(%rsi)
	prefetcht1 0x2C0(%rsi)
	prefetcht1 0x300(%rsi)
	prefetcht1 0x340(%rsi)
	prefetcht1 0x380(%rsi)
	prefetcht1 0x3C0(%rsi)
	vmovdqu64 (%rsi), %zmm0
	vmovdqu64 0x40(%rsi), %zmm1
	vmovdqu64 0x80(%rsi), %zmm2
	vmovdqu64 0xC0(%rsi), %zmm3
	vmovntdq %zmm0, (%rdi)
	vmovntdq %zmm1, 0x40(%rdi)
	vmovntdq %zmm2, 0x80(%rdi)
	vmovntdq %zmm3, 0xC0(%rdi)
	sub	$256, %rdx
	add	$256, %rsi
	add	$256, %rdi
	cmp	$256, %rdx
	ja	L(gobble_256bytes_nt_loop)
	sfence
	vmovups	%zmm4, (%r11)
	vmovups	%zmm5, 0x40(%r11)
	jmp	L(check)

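/* Backward variant of the non-temporal copy: save the last 128 bytes in
   %zmm4/%zmm5, align the end of the destination, and finish the leftover
   head of at most 256 bytes through L(check).  */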
L(preloop_large_bkw):
	vmovups	-0x80(%rcx), %zmm4
	vmovups	-0x40(%rcx), %zmm5

/* Align end of destination for access with non-temporal stores.  */
	mov	%r9, %r8
	and	$-0x80, %r9
	sub	%r9, %r8
	sub	%r8, %rcx
	sub	%r8, %rdx
	add	%r9, %r8
L(gobble_256bytes_nt_loop_bkw):
	prefetcht1 -0x400(%rcx)
	prefetcht1 -0x3C0(%rcx)
	prefetcht1 -0x380(%rcx)
	prefetcht1 -0x340(%rcx)
	prefetcht1 -0x300(%rcx)
	prefetcht1 -0x2C0(%rcx)
	prefetcht1 -0x280(%rcx)
	prefetcht1 -0x240(%rcx)
	vmovdqu64 -0x100(%rcx), %zmm0
	vmovdqu64 -0xC0(%rcx), %zmm1
	vmovdqu64 -0x80(%rcx), %zmm2
	vmovdqu64 -0x40(%rcx), %zmm3
	vmovntdq %zmm0, -0x100(%r9)
	vmovntdq %zmm1, -0xC0(%r9)
	vmovntdq %zmm2, -0x80(%r9)
	vmovntdq %zmm3, -0x40(%r9)
	sub	$256, %rdx
	sub	$256, %rcx
	sub	$256, %r9
	cmp	$256, %rdx
	ja	L(gobble_256bytes_nt_loop_bkw)
	sfence
	vmovups	%zmm4, -0x80(%r8)
	vmovups	%zmm5, -0x40(%r8)
	jmp	L(check)
END (__memmove_avx512_no_vzeroupper)

strong_alias (__memmove_avx512_no_vzeroupper, __memcpy_avx512_no_vzeroupper)
strong_alias (__memmove_chk_avx512_no_vzeroupper, __memcpy_chk_avx512_no_vzeroupper)
#endif