/* memmove/memcpy/mempcpy with unaligned load/store and rep movsb
   Copyright (C) 2016-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/* memmove/memcpy/mempcpy are implemented as:
   1. Use overlapping loads and stores to avoid branches.
   2. Load all of the source into registers and store it together to
      avoid possible address overlap between source and destination.
   3. If size is 8 * VEC_SIZE or less, load all of the source into
      registers and store it together.
   4. If the destination address is greater than the source address,
      copy backward 4 * VEC_SIZE at a time with unaligned loads and
      aligned stores.  Load the first 4 * VEC and the last VEC before
      the loop and store them after the loop to support overlapping
      addresses.
   5. Otherwise, copy forward 4 * VEC_SIZE at a time with unaligned
      loads and aligned stores.  Load the last 4 * VEC and the first
      VEC before the loop and store them after the loop to support
      overlapping addresses.
   6. If size >= __x86_shared_non_temporal_threshold and there is no
      overlap between destination and source, use non-temporal stores
      instead of aligned stores.  */
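
/* Illustration only, not part of the implementation below: a minimal C
   sketch of techniques 1-3 above, assuming an 8-byte "vector" and a size
   in [8, 16].  All of the source is loaded before anything is stored, so
   any overlap between src and dst cannot corrupt unread bytes; the two
   stores may overlap each other, which is harmless.  The function name
   and the fixed 8-byte width are illustrative choices, not names used by
   this file.

     #include <stdint.h>
     #include <string.h>

     // Copy n bytes, 8 <= n <= 16, safely even if dst and src overlap.
     static void
     copy_8_to_16_bytes (char *dst, const char *src, size_t n)
     {
       uint64_t head, tail;
       memcpy (&head, src, 8);          // first 8 bytes
       memcpy (&tail, src + n - 8, 8);  // last 8 bytes, may overlap head
       memcpy (dst, &head, 8);          // no store happens before both loads
       memcpy (dst + n - 8, &tail, 8);
     }
*/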

#include <sysdep.h>

#ifndef MEMCPY_SYMBOL
# define MEMCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMPCPY_SYMBOL
# define MEMPCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMMOVE_CHK_SYMBOL
# define MEMMOVE_CHK_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
#endif

#ifndef VZEROUPPER
# if VEC_SIZE > 16
#  define VZEROUPPER vzeroupper
# else
#  define VZEROUPPER
# endif
#endif

#ifndef PREFETCH
# define PREFETCH(addr) prefetcht0 addr
#endif

/* Assume 64-byte prefetch size.  */
#ifndef PREFETCH_SIZE
# define PREFETCH_SIZE 64
#endif

#define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4)

#if PREFETCH_SIZE == 64
# if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base)
# elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE)base)
# elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base)
# else
#  error Unsupported PREFETCHED_LOAD_SIZE!
# endif
#else
# error Unsupported PREFETCH_SIZE!
#endif
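
/* For example (illustrative, with VEC_SIZE == 32 so that
   PREFETCHED_LOAD_SIZE == 128 == 2 * PREFETCH_SIZE), the call
   PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2) used in the
   forward non-temporal loop below expands to

	prefetcht0 (128 * 2)(%rsi)
	prefetcht0 (128 * 2 + 1 * 64)(%rsi)

   i.e. it touches the two cache lines 256 and 320 bytes ahead of the
   current source position.  "dir" is -1 in the backward loop, so the
   same macro prefetches behind the cursor there instead.  */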

#ifndef SECTION
# error SECTION is not defined!
#endif

	.section SECTION(.text),"ax",@progbits
#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
#endif

ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned))
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP
	jmp	L(start)
END (MEMPCPY_SYMBOL (__mempcpy, unaligned))

#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
#endif

ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
	movq	%rdi, %rax
L(start):
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)
	cmp	$(VEC_SIZE * 2), %RDX_LP
	ja	L(more_2x_vec)
#if !defined USE_MULTIARCH || !IS_IN (libc)
L(last_2x_vec):
#endif
	/* Copy from VEC_SIZE to 2 * VEC_SIZE bytes.  No branch when
	   size == VEC_SIZE.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
	VZEROUPPER
#if !defined USE_MULTIARCH || !IS_IN (libc)
L(nop):
#endif
	ret
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMMOVE_SYMBOL (__memmove, unaligned))

# if VEC_SIZE == 16
ENTRY (__mempcpy_chk_erms)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__mempcpy_chk_erms)

/* Only used to measure performance of REP MOVSB.  */
ENTRY (__mempcpy_erms)
	mov	%RDI_LP, %RAX_LP
	/* Skip zero length.  */
	test	%RDX_LP, %RDX_LP
	jz	2f
	add	%RDX_LP, %RAX_LP
	jmp	L(start_movsb)
END (__mempcpy_erms)

ENTRY (__memmove_chk_erms)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__memmove_chk_erms)

ENTRY (__memmove_erms)
	movq	%rdi, %rax
	/* Skip zero length.  */
	test	%RDX_LP, %RDX_LP
	jz	2f
L(start_movsb):
	mov	%RDX_LP, %RCX_LP
	cmp	%RSI_LP, %RDI_LP
	jb	1f
	/* Source == destination is less common.  */
	je	2f
	lea	(%rsi,%rcx), %RDX_LP
	cmp	%RDX_LP, %RDI_LP
	jb	L(movsb_backward)
1:
	rep movsb
2:
	ret
L(movsb_backward):
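	/* With the direction flag set, "rep movsb" copies from high to
	   low addresses, so point %rsi and %rdi at the last byte of each
	   buffer; the flag is cleared again before returning, as the ABI
	   requires DF == 0 on function exit.  */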
	leaq	-1(%rdi,%rcx), %rdi
	leaq	-1(%rsi,%rcx), %rsi
	std
	rep movsb
	cld
	ret
END (__memmove_erms)
strong_alias (__memmove_erms, __memcpy_erms)
strong_alias (__memmove_chk_erms, __memcpy_chk_erms)
# endif

# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
# endif

ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP
	jmp	L(start_erms)
END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))

# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
# endif

ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
	movq	%rdi, %rax
L(start_erms):
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)
	cmp	$(VEC_SIZE * 2), %RDX_LP
	ja	L(movsb_more_2x_vec)
L(last_2x_vec):
	/* Copy from VEC_SIZE to 2 * VEC_SIZE bytes.  No branch when
	   size == VEC_SIZE.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
L(return):
	VZEROUPPER
	ret

L(movsb):
	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
	jae	L(more_8x_vec)
	cmpq	%rsi, %rdi
	jb	1f
	/* Source == destination is less common.  */
	je	L(nop)
	leaq	(%rsi,%rdx), %r9
	cmpq	%r9, %rdi
	/* Avoid slow backward REP MOVSB.  */
	jb	L(more_8x_vec_backward)
1:
	mov	%RDX_LP, %RCX_LP
	rep movsb
L(nop):
	ret
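
/* Illustration only: a C sketch of the dispatch performed by L(movsb)
   above, omitting the non-temporal-threshold check at the top.
   rep_movsb and copy_backward_4x_vec are hypothetical helper names
   standing in for "rep movsb" and L(more_8x_vec_backward); they are not
   defined anywhere in this file, and <stddef.h> is assumed for size_t.

     static void
     dispatch_movsb (char *dst, const char *src, size_t len)
     {
       if (dst < src)
         rep_movsb (dst, src, len);            // forward copy cannot clobber src
       else if (dst == src)
         ;                                     // nothing to copy
       else if (dst < src + len)
         copy_backward_4x_vec (dst, src, len); // avoid slow backward movsb
       else
         rep_movsb (dst, src, len);            // disjoint buffers, forward is fine
     }
*/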
#endif

L(less_vec):
	/* Less than 1 VEC.  */
#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
# error Unsupported VEC_SIZE!
#endif
#if VEC_SIZE > 32
	cmpb	$32, %dl
	jae	L(between_32_63)
#endif
#if VEC_SIZE > 16
	cmpb	$16, %dl
	jae	L(between_16_31)
#endif
	cmpb	$8, %dl
	jae	L(between_8_15)
	cmpb	$4, %dl
	jae	L(between_4_7)
	cmpb	$1, %dl
	ja	L(between_2_3)
	jb	1f
	movzbl	(%rsi), %ecx
	movb	%cl, (%rdi)
1:
	ret
#if VEC_SIZE > 32
L(between_32_63):
	/* From 32 to 63.  No branch when size == 32.  */
	vmovdqu	(%rsi), %ymm0
	vmovdqu	-32(%rsi,%rdx), %ymm1
	vmovdqu	%ymm0, (%rdi)
	vmovdqu	%ymm1, -32(%rdi,%rdx)
	VZEROUPPER
	ret
#endif
#if VEC_SIZE > 16
	/* From 16 to 31.  No branch when size == 16.  */
L(between_16_31):
	vmovdqu	(%rsi), %xmm0
	vmovdqu	-16(%rsi,%rdx), %xmm1
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm1, -16(%rdi,%rdx)
	ret
#endif
L(between_8_15):
	/* From 8 to 15.  No branch when size == 8.  */
	movq	-8(%rsi,%rdx), %rcx
	movq	(%rsi), %rsi
	movq	%rcx, -8(%rdi,%rdx)
	movq	%rsi, (%rdi)
	ret
L(between_4_7):
	/* From 4 to 7.  No branch when size == 4.  */
	movl	-4(%rsi,%rdx), %ecx
	movl	(%rsi), %esi
	movl	%ecx, -4(%rdi,%rdx)
	movl	%esi, (%rdi)
	ret
L(between_2_3):
	/* From 2 to 3.  No branch when size == 2.  */
	movzwl	-2(%rsi,%rdx), %ecx
	movzwl	(%rsi), %esi
	movw	%cx, -2(%rdi,%rdx)
	movw	%si, (%rdi)
	ret

#if defined USE_MULTIARCH && IS_IN (libc)
L(movsb_more_2x_vec):
	cmp	__x86_rep_movsb_threshold(%rip), %RDX_LP
	ja	L(movsb)
#endif
L(more_2x_vec):
	/* More than 2 * VEC and there may be overlap between
	   destination and source.  */
	cmpq	$(VEC_SIZE * 8), %rdx
	ja	L(more_8x_vec)
	cmpq	$(VEC_SIZE * 4), %rdx
	jb	L(last_4x_vec)
	/* Copy from 4 * VEC to 8 * VEC, inclusively.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(4)
	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
	VMOVU	-(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
	VMOVU	-(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), VEC_SIZE(%rdi)
	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
	VMOVU	%VEC(4), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
	VZEROUPPER
	ret
L(last_4x_vec):
	/* Copy from 2 * VEC to 4 * VEC.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(2)
	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), VEC_SIZE(%rdi)
	VMOVU	%VEC(2), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
	VZEROUPPER
	ret

L(more_8x_vec):
	cmpq	%rsi, %rdi
	ja	L(more_8x_vec_backward)
	/* Source == destination is less common.  */
	je	L(nop)
	/* Load the first VEC and last 4 * VEC to support overlapping
	   addresses.  */
	VMOVU	(%rsi), %VEC(4)
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(5)
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
	/* Save start and stop of the destination buffer.  */
	movq	%rdi, %r11
	leaq	-VEC_SIZE(%rdi, %rdx), %rcx
	/* Align destination for aligned stores in the loop.  Compute
	   how much destination is misaligned.  */
	movq	%rdi, %r8
	andq	$(VEC_SIZE - 1), %r8
	/* Get the negative of offset for alignment.  */
	subq	$VEC_SIZE, %r8
	/* Adjust source.  */
	subq	%r8, %rsi
	/* Adjust destination which should be aligned now.  */
	subq	%r8, %rdi
	/* Adjust length.  */
	addq	%r8, %rdx
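	/* Worked example with illustrative values: if VEC_SIZE == 32 and
	   the destination starts at 0x1003, then %r8 = (0x1003 & 31) - 32
	   = 3 - 32 = -29, so the three subq/addq above advance %rdi to
	   the aligned address 0x1020, advance %rsi by the same 29 bytes
	   and shrink %rdx by 29.  The bytes skipped at the start are
	   still covered, because %VEC(4) was loaded from the original
	   source start and is stored to the original destination (saved
	   in %r11) after the loop.  */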
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
	/* Check non-temporal store threshold.  */
	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
	ja	L(large_forward)
#endif
L(loop_4x_vec_forward):
	/* Copy 4 * VEC at a time forward.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	addq	$(VEC_SIZE * 4), %rsi
	subq	$(VEC_SIZE * 4), %rdx
	VMOVA	%VEC(0), (%rdi)
	VMOVA	%VEC(1), VEC_SIZE(%rdi)
	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
	addq	$(VEC_SIZE * 4), %rdi
	cmpq	$(VEC_SIZE * 4), %rdx
	ja	L(loop_4x_vec_forward)
	/* Store the last 4 * VEC.  */
	VMOVU	%VEC(5), (%rcx)
	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
	/* Store the first VEC.  */
	VMOVU	%VEC(4), (%r11)
	VZEROUPPER
	ret

L(more_8x_vec_backward):
	/* Load the first 4 * VEC and last VEC to support overlapping
	   addresses.  */
	VMOVU	(%rsi), %VEC(4)
	VMOVU	VEC_SIZE(%rsi), %VEC(5)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(6)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(7)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(8)
	/* Save stop of the destination buffer.  */
	leaq	-VEC_SIZE(%rdi, %rdx), %r11
	/* Align destination end for aligned stores in the loop.  Compute
	   how much destination end is misaligned.  */
	leaq	-VEC_SIZE(%rsi, %rdx), %rcx
	movq	%r11, %r9
	movq	%r11, %r8
	andq	$(VEC_SIZE - 1), %r8
	/* Adjust source.  */
	subq	%r8, %rcx
	/* Adjust the end of destination which should be aligned now.  */
	subq	%r8, %r9
	/* Adjust length.  */
	subq	%r8, %rdx
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
	/* Check non-temporal store threshold.  */
	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
	ja	L(large_backward)
#endif
L(loop_4x_vec_backward):
	/* Copy 4 * VEC at a time backward.  */
	VMOVU	(%rcx), %VEC(0)
	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
	subq	$(VEC_SIZE * 4), %rcx
	subq	$(VEC_SIZE * 4), %rdx
	VMOVA	%VEC(0), (%r9)
	VMOVA	%VEC(1), -VEC_SIZE(%r9)
	VMOVA	%VEC(2), -(VEC_SIZE * 2)(%r9)
	VMOVA	%VEC(3), -(VEC_SIZE * 3)(%r9)
	subq	$(VEC_SIZE * 4), %r9
	cmpq	$(VEC_SIZE * 4), %rdx
	ja	L(loop_4x_vec_backward)
	/* Store the first 4 * VEC.  */
	VMOVU	%VEC(4), (%rdi)
	VMOVU	%VEC(5), VEC_SIZE(%rdi)
	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
	/* Store the last VEC.  */
	VMOVU	%VEC(8), (%r11)
	VZEROUPPER
	ret

#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
L(large_forward):
	/* Don't use non-temporal store if there is overlap between
	   destination and source since destination may be in cache
	   when source is loaded.  */
	leaq	(%rdi, %rdx), %r10
	cmpq	%r10, %rsi
	jb	L(loop_4x_vec_forward)
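	/* Here %r10 is the end of the destination; the jb above falls
	   back to the regular cached-store loop unless the source starts
	   at or beyond that end, i.e. non-temporal stores are only used
	   for effectively disjoint buffers.  */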
L(loop_large_forward):
	/* Copy 4 * VEC at a time forward with non-temporal stores.  */
	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	addq	$PREFETCHED_LOAD_SIZE, %rsi
	subq	$PREFETCHED_LOAD_SIZE, %rdx
	VMOVNT	%VEC(0), (%rdi)
	VMOVNT	%VEC(1), VEC_SIZE(%rdi)
	VMOVNT	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVNT	%VEC(3), (VEC_SIZE * 3)(%rdi)
	addq	$PREFETCHED_LOAD_SIZE, %rdi
	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
	ja	L(loop_large_forward)
	sfence
	/* Store the last 4 * VEC.  */
	VMOVU	%VEC(5), (%rcx)
	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
	/* Store the first VEC.  */
	VMOVU	%VEC(4), (%r11)
	VZEROUPPER
	ret

L(large_backward):
	/* Don't use non-temporal store if there is overlap between
	   destination and source since destination may be in cache
	   when source is loaded.  */
	leaq	(%rcx, %rdx), %r10
	cmpq	%r10, %r9
	jb	L(loop_4x_vec_backward)
L(loop_large_backward):
	/* Copy 4 * VEC at a time backward with non-temporal stores.  */
	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
	VMOVU	(%rcx), %VEC(0)
	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
	subq	$PREFETCHED_LOAD_SIZE, %rcx
	subq	$PREFETCHED_LOAD_SIZE, %rdx
	VMOVNT	%VEC(0), (%r9)
	VMOVNT	%VEC(1), -VEC_SIZE(%r9)
	VMOVNT	%VEC(2), -(VEC_SIZE * 2)(%r9)
	VMOVNT	%VEC(3), -(VEC_SIZE * 3)(%r9)
	subq	$PREFETCHED_LOAD_SIZE, %r9
	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
	ja	L(loop_large_backward)
	sfence
	/* Store the first 4 * VEC.  */
	VMOVU	%VEC(4), (%rdi)
	VMOVU	%VEC(5), VEC_SIZE(%rdi)
	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
	/* Store the last VEC.  */
	VMOVU	%VEC(8), (%r11)
	VZEROUPPER
	ret
#endif
END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))

#if IS_IN (libc)
# ifdef USE_MULTIARCH
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms),
	      MEMMOVE_SYMBOL (__memcpy, unaligned_erms))
#  ifdef SHARED
strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms),
	      MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms))
#  endif
# endif
# ifdef SHARED
strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned),
	      MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned))
# endif
#endif
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned),
	      MEMCPY_SYMBOL (__memcpy, unaligned))