/* memmove/memcpy/mempcpy with unaligned load/store and rep movsb
   Copyright (C) 2016-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/* memmove/memcpy/mempcpy is implemented as:
   1. Use overlapping load and store to avoid branch.
   2. Load all sources into registers and store them together to avoid
      possible address overlap between source and destination.
   3. If size is 8 * VEC_SIZE or less, load all sources into registers
      and store them together.
   4. If address of destination > address of source, backward copy
      4 * VEC_SIZE at a time with unaligned load and aligned store.
      Load the first 4 * VEC and last VEC before the loop and store
      them after the loop to support overlapping addresses.
   5. Otherwise, forward copy 4 * VEC_SIZE at a time with unaligned
      load and aligned store.  Load the last 4 * VEC and first VEC
      before the loop and store them after the loop to support
      overlapping addresses.
   6. If size >= __x86_shared_non_temporal_threshold and there is no
      overlap between destination and source, use non-temporal store
      instead of aligned store.  */
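/* Illustrative only, not part of the build: a minimal C sketch of the
   overlapping load/store trick from step 1, as used by L(last_2x_vec)
   and the small-size cases below.  The function name and the fixed
   8..16 byte range are assumptions made for this example.

     #include <stdint.h>
     #include <string.h>

     static void
     copy_8_to_16_bytes (char *dst, const char *src, size_t n)
     {
       // Requires 8 <= n <= 16.  Both loads are done before either
       // store, so overlapping src/dst still copies correctly and no
       // branch on the exact length is needed.
       uint64_t head, tail;
       memcpy (&head, src, 8);          // first 8 bytes
       memcpy (&tail, src + n - 8, 8);  // last 8 bytes, may overlap head
       memcpy (dst, &head, 8);
       memcpy (dst + n - 8, &tail, 8);
     }  */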

#include <sysdep.h>

#ifndef MEMCPY_SYMBOL
# define MEMCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMPCPY_SYMBOL
# define MEMPCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMMOVE_CHK_SYMBOL
# define MEMMOVE_CHK_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
#endif

#ifndef VZEROUPPER
# if VEC_SIZE > 16
#  define VZEROUPPER vzeroupper
# else
#  define VZEROUPPER
# endif
#endif

#ifndef PREFETCH
# define PREFETCH(addr) prefetcht0 addr
#endif

/* Assume 64-byte prefetch size.  */
#ifndef PREFETCH_SIZE
# define PREFETCH_SIZE 64
#endif

#define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4)

#if PREFETCH_SIZE == 64
# if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base)
# elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE)base)
# elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base)
# else
#  error Unsupported PREFETCHED_LOAD_SIZE!
# endif
#else
# error Unsupported PREFETCH_SIZE!
#endif
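/* Example expansion, assuming VEC_SIZE == 32 (so PREFETCHED_LOAD_SIZE is
   128 == 2 * PREFETCH_SIZE): the use below,

     PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)

   expands to

     prefetcht0 256(%rsi); prefetcht0 320(%rsi)

   i.e. one prefetch for each 64-byte line of the 4 * VEC block that is
   two iterations ahead.  */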

#ifndef SECTION
# error SECTION is not defined!
#endif

	.section SECTION(.text),"ax",@progbits
#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
#endif

ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned))
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP
	jmp	L(start)
END (MEMPCPY_SYMBOL (__mempcpy, unaligned))

#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
#endif

ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
	movq	%rdi, %rax
L(start):
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)
	cmp	$(VEC_SIZE * 2), %RDX_LP
	ja	L(more_2x_vec)
#if !defined USE_MULTIARCH || !IS_IN (libc)
L(last_2x_vec):
#endif
	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
	VZEROUPPER
#if !defined USE_MULTIARCH || !IS_IN (libc)
L(nop):
#endif
	ret
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMMOVE_SYMBOL (__memmove, unaligned))

# if VEC_SIZE == 16
ENTRY (__mempcpy_chk_erms)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__mempcpy_chk_erms)

/* Only used to measure performance of REP MOVSB.  */
ENTRY (__mempcpy_erms)
	mov	%RDI_LP, %RAX_LP
	/* Skip zero length.  */
	test	%RDX_LP, %RDX_LP
	jz	2f
	add	%RDX_LP, %RAX_LP
	jmp	L(start_movsb)
END (__mempcpy_erms)

ENTRY (__memmove_chk_erms)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__memmove_chk_erms)

ENTRY (__memmove_erms)
	movq	%rdi, %rax
	/* Skip zero length.  */
	test	%RDX_LP, %RDX_LP
	jz	2f
L(start_movsb):
	mov	%RDX_LP, %RCX_LP
	cmp	%RSI_LP, %RDI_LP
	jb	1f
	/* Source == destination is less common.  */
	je	2f
	lea	(%rsi,%rcx), %RDX_LP
	cmp	%RDX_LP, %RDI_LP
	jb	L(movsb_backward)
1:
	rep movsb
2:
	ret
L(movsb_backward):
	leaq	-1(%rdi,%rcx), %rdi
	leaq	-1(%rsi,%rcx), %rsi
	std
	rep movsb
	cld
	ret
END (__memmove_erms)
strong_alias (__memmove_erms, __memcpy_erms)
strong_alias (__memmove_chk_erms, __memcpy_chk_erms)
# endif

# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
# endif

ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP
	jmp	L(start_erms)
END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))

# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
# endif

ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
	movq	%rdi, %rax
L(start_erms):
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)
	cmp	$(VEC_SIZE * 2), %RDX_LP
	ja	L(movsb_more_2x_vec)
L(last_2x_vec):
	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
L(return):
	VZEROUPPER
	ret

L(movsb):
	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
	jae	L(more_8x_vec)
	cmpq	%rsi, %rdi
	jb	1f
	/* Source == destination is less common.  */
	je	L(nop)
	leaq	(%rsi,%rdx), %r9
	cmpq	%r9, %rdi
	/* Avoid slow backward REP MOVSB.  */
	jb	L(more_8x_vec_backward)
1:
	mov	%RDX_LP, %RCX_LP
	rep movsb
L(nop):
	ret
#endif
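/* Summary of the L(movsb) dispatch above (a restatement of the existing
   logic, not new behaviour): it is only reached from
   L(movsb_more_2x_vec) below when size > __x86_rep_movsb_threshold, and
   REP MOVSB is then used only when size is also below
   __x86_shared_non_temporal_threshold and a forward copy is safe, i.e.
   dst < src or dst >= src + size.  Equal pointers return via L(nop),
   sizes at or above the non-temporal threshold go to L(more_8x_vec), and
   a forward-overlapping pair (src < dst < src + size) goes to
   L(more_8x_vec_backward) to avoid a slow backward REP MOVSB.  */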

L(less_vec):
	/* Less than 1 VEC.  */
#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
# error Unsupported VEC_SIZE!
#endif
#if VEC_SIZE > 32
	cmpb	$32, %dl
	jae	L(between_32_63)
#endif
#if VEC_SIZE > 16
	cmpb	$16, %dl
	jae	L(between_16_31)
#endif
	cmpb	$8, %dl
	jae	L(between_8_15)
	cmpb	$4, %dl
	jae	L(between_4_7)
	cmpb	$1, %dl
	ja	L(between_2_3)
	jb	1f
	movzbl	(%rsi), %ecx
	movb	%cl, (%rdi)
1:
	ret
#if VEC_SIZE > 32
L(between_32_63):
	/* From 32 to 63.  No branch when size == 32.  */
	vmovdqu	(%rsi), %ymm0
	vmovdqu	-32(%rsi,%rdx), %ymm1
	vmovdqu	%ymm0, (%rdi)
	vmovdqu	%ymm1, -32(%rdi,%rdx)
	VZEROUPPER
	ret
#endif
#if VEC_SIZE > 16
	/* From 16 to 31.  No branch when size == 16.  */
L(between_16_31):
	vmovdqu	(%rsi), %xmm0
	vmovdqu	-16(%rsi,%rdx), %xmm1
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm1, -16(%rdi,%rdx)
	ret
#endif
L(between_8_15):
	/* From 8 to 15.  No branch when size == 8.  */
	movq	-8(%rsi,%rdx), %rcx
	movq	(%rsi), %rsi
	movq	%rcx, -8(%rdi,%rdx)
	movq	%rsi, (%rdi)
	ret
L(between_4_7):
	/* From 4 to 7.  No branch when size == 4.  */
	movl	-4(%rsi,%rdx), %ecx
	movl	(%rsi), %esi
	movl	%ecx, -4(%rdi,%rdx)
	movl	%esi, (%rdi)
	ret
L(between_2_3):
	/* From 2 to 3.  No branch when size == 2.  */
	movzwl	-2(%rsi,%rdx), %ecx
	movzwl	(%rsi), %esi
	movw	%cx, -2(%rdi,%rdx)
	movw	%si, (%rdi)
	ret
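/* Worked example of the overlapping accesses above: for L(between_4_7)
   with size == 5, the two loads read source bytes 1-4 and 0-3 (their
   middle bytes overlap) and the two stores then write destination bytes
   1-4 and 0-3, covering all five bytes with no branch on the exact
   length.  Since both loads happen before either store, an overlapping
   source and destination is still handled correctly.  */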

#if defined USE_MULTIARCH && IS_IN (libc)
L(movsb_more_2x_vec):
	cmp	__x86_rep_movsb_threshold(%rip), %RDX_LP
	ja	L(movsb)
#endif
L(more_2x_vec):
	/* More than 2 * VEC and there may be overlap between destination
	   and source.  */
	cmpq	$(VEC_SIZE * 8), %rdx
	ja	L(more_8x_vec)
	cmpq	$(VEC_SIZE * 4), %rdx
	jb	L(last_4x_vec)
	/* Copy from 4 * VEC to 8 * VEC, inclusively.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(4)
	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
	VMOVU	-(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
	VMOVU	-(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), VEC_SIZE(%rdi)
	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
	VMOVU	%VEC(4), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
	VZEROUPPER
	ret
L(last_4x_vec):
	/* Copy from 2 * VEC to 4 * VEC.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(2)
	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), VEC_SIZE(%rdi)
	VMOVU	%VEC(2), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
	VZEROUPPER
	ret

L(more_8x_vec):
	cmpq	%rsi, %rdi
	ja	L(more_8x_vec_backward)
	/* Source == destination is less common.  */
	je	L(nop)
	/* Load the first VEC and last 4 * VEC to support overlapping
	   addresses.  */
	VMOVU	(%rsi), %VEC(4)
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(5)
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
	/* Save start and stop of the destination buffer.  */
	movq	%rdi, %r11
	leaq	-VEC_SIZE(%rdi, %rdx), %rcx
	/* Align destination for aligned stores in the loop.  Compute
	   how much destination is misaligned.  */
	movq	%rdi, %r8
	andq	$(VEC_SIZE - 1), %r8
	/* Get the negative of offset for alignment.  */
	subq	$VEC_SIZE, %r8
	/* Adjust source.  */
	subq	%r8, %rsi
	/* Adjust destination which should be aligned now.  */
	subq	%r8, %rdi
	/* Adjust length.  */
	addq	%r8, %rdx
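	/* Worked example of the adjustment above, assuming VEC_SIZE == 32
	   and an incoming %rdi of 0x1005: %r8 = 5 - 32 = -27, so %rsi and
	   %rdi both advance by 27 (%rdi becomes 32-byte aligned at 0x1020)
	   and %rdx shrinks by 27.  The skipped head of the destination is
	   still covered by the unaligned store of %VEC(4) to %r11 after
	   the loop.  */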
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
	/* Check non-temporal store threshold.  */
	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
	ja	L(large_forward)
#endif
L(loop_4x_vec_forward):
	/* Copy 4 * VEC at a time forward.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	addq	$(VEC_SIZE * 4), %rsi
	subq	$(VEC_SIZE * 4), %rdx
	VMOVA	%VEC(0), (%rdi)
	VMOVA	%VEC(1), VEC_SIZE(%rdi)
	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
	addq	$(VEC_SIZE * 4), %rdi
	cmpq	$(VEC_SIZE * 4), %rdx
	ja	L(loop_4x_vec_forward)
	/* Store the last 4 * VEC.  */
	VMOVU	%VEC(5), (%rcx)
	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
	/* Store the first VEC.  */
	VMOVU	%VEC(4), (%r11)
	VZEROUPPER
	ret

L(more_8x_vec_backward):
	/* Load the first 4 * VEC and last VEC to support overlapping
	   addresses.  */
	VMOVU	(%rsi), %VEC(4)
	VMOVU	VEC_SIZE(%rsi), %VEC(5)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(6)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(7)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(8)
	/* Save stop of the destination buffer.  */
	leaq	-VEC_SIZE(%rdi, %rdx), %r11
	/* Align destination end for aligned stores in the loop.  Compute
	   how much destination end is misaligned.  */
	leaq	-VEC_SIZE(%rsi, %rdx), %rcx
	movq	%r11, %r9
	movq	%r11, %r8
	andq	$(VEC_SIZE - 1), %r8
	/* Adjust source.  */
	subq	%r8, %rcx
	/* Adjust the end of destination which should be aligned now.  */
	subq	%r8, %r9
	/* Adjust length.  */
	subq	%r8, %rdx
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
	/* Check non-temporal store threshold.  */
	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
	ja	L(large_backward)
#endif
L(loop_4x_vec_backward):
	/* Copy 4 * VEC at a time backward.  */
	VMOVU	(%rcx), %VEC(0)
	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
	subq	$(VEC_SIZE * 4), %rcx
	subq	$(VEC_SIZE * 4), %rdx
	VMOVA	%VEC(0), (%r9)
	VMOVA	%VEC(1), -VEC_SIZE(%r9)
	VMOVA	%VEC(2), -(VEC_SIZE * 2)(%r9)
	VMOVA	%VEC(3), -(VEC_SIZE * 3)(%r9)
	subq	$(VEC_SIZE * 4), %r9
	cmpq	$(VEC_SIZE * 4), %rdx
	ja	L(loop_4x_vec_backward)
	/* Store the first 4 * VEC.  */
	VMOVU	%VEC(4), (%rdi)
	VMOVU	%VEC(5), VEC_SIZE(%rdi)
	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
	/* Store the last VEC.  */
	VMOVU	%VEC(8), (%r11)
	VZEROUPPER
	ret

#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
L(large_forward):
	/* Don't use non-temporal store if there is overlap between
	   destination and source since destination may be in cache
	   when source is loaded.  */
	leaq	(%rdi, %rdx), %r10
	cmpq	%r10, %rsi
	jb	L(loop_4x_vec_forward)
L(loop_large_forward):
	/* Copy 4 * VEC at a time forward with non-temporal stores.  */
	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	addq	$PREFETCHED_LOAD_SIZE, %rsi
	subq	$PREFETCHED_LOAD_SIZE, %rdx
	VMOVNT	%VEC(0), (%rdi)
	VMOVNT	%VEC(1), VEC_SIZE(%rdi)
	VMOVNT	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVNT	%VEC(3), (VEC_SIZE * 3)(%rdi)
	addq	$PREFETCHED_LOAD_SIZE, %rdi
	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
	ja	L(loop_large_forward)
	sfence
	/* Store the last 4 * VEC.  */
	VMOVU	%VEC(5), (%rcx)
	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
	/* Store the first VEC.  */
	VMOVU	%VEC(4), (%r11)
	VZEROUPPER
	ret

L(large_backward):
	/* Don't use non-temporal store if there is overlap between
	   destination and source since destination may be in cache
	   when source is loaded.  */
	leaq	(%rcx, %rdx), %r10
	cmpq	%r10, %r9
	jb	L(loop_4x_vec_backward)
L(loop_large_backward):
	/* Copy 4 * VEC at a time backward with non-temporal stores.  */
	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
	VMOVU	(%rcx), %VEC(0)
	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
	subq	$PREFETCHED_LOAD_SIZE, %rcx
	subq	$PREFETCHED_LOAD_SIZE, %rdx
	VMOVNT	%VEC(0), (%r9)
	VMOVNT	%VEC(1), -VEC_SIZE(%r9)
	VMOVNT	%VEC(2), -(VEC_SIZE * 2)(%r9)
	VMOVNT	%VEC(3), -(VEC_SIZE * 3)(%r9)
	subq	$PREFETCHED_LOAD_SIZE, %r9
	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
	ja	L(loop_large_backward)
	sfence
	/* Store the first 4 * VEC.  */
	VMOVU	%VEC(4), (%rdi)
	VMOVU	%VEC(5), VEC_SIZE(%rdi)
	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
	/* Store the last VEC.  */
	VMOVU	%VEC(8), (%r11)
	VZEROUPPER
	ret
#endif
END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))

#if IS_IN (libc)
# ifdef USE_MULTIARCH
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms),
	      MEMMOVE_SYMBOL (__memcpy, unaligned_erms))
#  ifdef SHARED
strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms),
	      MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms))
#  endif
# endif
# ifdef SHARED
strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned),
	      MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned))
# endif
#endif
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned),
	      MEMCPY_SYMBOL (__memcpy, unaligned))