/* memmove/memcpy/mempcpy with unaligned load/store and rep movsb
   Copyright (C) 2016-2017 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

/* memmove/memcpy/mempcpy is implemented as:
   1. Use overlapping load and store to avoid branch.
   2. Load all sources into registers and store them together to avoid
      possible address overlap between source and destination.
   3. If size is 8 * VEC_SIZE or less, load all sources into registers
      and store them together.
   4. If address of destination > address of source, backward copy
      4 * VEC_SIZE at a time with unaligned load and aligned store.
      Load the first 4 * VEC and last VEC before the loop and store
      them after the loop to support overlapping addresses.
   5. Otherwise, forward copy 4 * VEC_SIZE at a time with unaligned
      load and aligned store.  Load the last 4 * VEC and first VEC
      before the loop and store them after the loop to support
      overlapping addresses.
   6. If size >= __x86_shared_non_temporal_threshold and there is no
      overlap between destination and source, use non-temporal store
      instead of aligned store.  */
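
/* Illustration only (not part of the build): a rough C sketch of the
   overlap-safe head/tail trick used for sizes between VEC_SIZE and
   2 * VEC_SIZE.  The helper name copy_le_2x_vec and the vec_t type are
   hypothetical; the real code keeps the data in %VEC registers.  The key
   point is that both loads happen before either store, so the copy is
   correct even when source and destination overlap:

     #include <string.h>
     #include <stddef.h>

     typedef struct { unsigned char b[32]; } vec_t;   // one VEC (e.g. AVX2)

     // Copy VEC_SIZE <= n <= 2 * VEC_SIZE bytes, overlap-safe, branch-free
     // with respect to the exact size.
     static void copy_le_2x_vec (unsigned char *dst, const unsigned char *src,
                                 size_t n)
     {
       vec_t head, tail;
       memcpy (&head, src, sizeof (vec_t));                      // first VEC
       memcpy (&tail, src + n - sizeof (vec_t), sizeof (vec_t)); // last VEC
       memcpy (dst, &head, sizeof (vec_t));
       memcpy (dst + n - sizeof (vec_t), &tail, sizeof (vec_t));
     }
*/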

#include <sysdep.h>

#ifndef MEMCPY_SYMBOL
# define MEMCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMPCPY_SYMBOL
# define MEMPCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMMOVE_CHK_SYMBOL
# define MEMMOVE_CHK_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
#endif

#ifndef VZEROUPPER
# if VEC_SIZE > 16
#  define VZEROUPPER vzeroupper
# else
#  define VZEROUPPER
# endif
#endif

/* Threshold to use Enhanced REP MOVSB.  Since there is overhead to set
   up REP MOVSB operation, REP MOVSB isn't faster on short data.  The
   memcpy micro benchmark in glibc shows that 2KB is the approximate
   value above which REP MOVSB becomes faster than SSE2 optimization
   on processors with Enhanced REP MOVSB.  Since larger register size
   can move more data with a single load and store, the threshold is
   higher with larger register size.  */
#ifndef REP_MOVSB_THRESHOLD
# define REP_MOVSB_THRESHOLD (2048 * (VEC_SIZE / 16))
#endif
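
/* For illustration, the default threshold above works out to
   2048 * (VEC_SIZE / 16) bytes: 2048 bytes for VEC_SIZE == 16 (SSE2),
   4096 bytes for VEC_SIZE == 32 (AVX2) and 8192 bytes for VEC_SIZE == 64
   (AVX-512).  */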

#ifndef PREFETCH
# define PREFETCH(addr) prefetcht0 addr
#endif

/* Assume 64-byte prefetch size.  */
#ifndef PREFETCH_SIZE
# define PREFETCH_SIZE 64
#endif

#define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4)

#if PREFETCH_SIZE == 64
# if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
        PREFETCH ((offset)base)
# elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
        PREFETCH ((offset)base); \
        PREFETCH ((offset + dir * PREFETCH_SIZE)base)
# elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
        PREFETCH ((offset)base); \
        PREFETCH ((offset + dir * PREFETCH_SIZE)base); \
        PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \
        PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base)
# else
#  error Unsupported PREFETCHED_LOAD_SIZE!
# endif
#else
# error Unsupported PREFETCH_SIZE!
#endif
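
/* Illustration only: what one PREFETCH_ONE_SET amounts to in C, assuming
   PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE (e.g. VEC_SIZE == 64) -- touch
   each 64-byte line of the next 256-byte block, in the copy direction.
   The function name is hypothetical:

     // dir is +1 for a forward copy, -1 for a backward copy.
     static void prefetch_one_set (const char *base, long offset, int dir)
     {
       __builtin_prefetch (base + offset, 0, 3);            // prefetcht0
       __builtin_prefetch (base + offset + dir * 64, 0, 3);
       __builtin_prefetch (base + offset + dir * 64 * 2, 0, 3);
       __builtin_prefetch (base + offset + dir * 64 * 3, 0, 3);
     }
*/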

#ifndef SECTION
# error SECTION is not defined!
#endif

        .section SECTION(.text),"ax",@progbits
#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
        cmpq %rdx, %rcx
        jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
#endif

#if VEC_SIZE == 16 || defined SHARED
ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned))
        movq %rdi, %rax
        addq %rdx, %rax
        jmp L(start)
END (MEMPCPY_SYMBOL (__mempcpy, unaligned))
#endif

#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
        cmpq %rdx, %rcx
        jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
#endif

ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
        movq %rdi, %rax
L(start):
        cmpq $VEC_SIZE, %rdx
        jb L(less_vec)
        cmpq $(VEC_SIZE * 2), %rdx
        ja L(more_2x_vec)
#if !defined USE_MULTIARCH || !IS_IN (libc)
L(last_2x_vec):
#endif
        /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
        VMOVU (%rsi), %VEC(0)
        VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1)
        VMOVU %VEC(0), (%rdi)
        VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx)
        VZEROUPPER
#if !defined USE_MULTIARCH || !IS_IN (libc)
L(nop):
#endif
        ret
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMMOVE_SYMBOL (__memmove, unaligned))

# if VEC_SIZE == 16
#  if defined SHARED
/* Only used to measure performance of REP MOVSB.  */
ENTRY (__mempcpy_erms)
        movq %rdi, %rax
        addq %rdx, %rax
        jmp L(start_movsb)
END (__mempcpy_erms)
#  endif

ENTRY (__memmove_erms)
        movq %rdi, %rax
L(start_movsb):
        movq %rdx, %rcx
        cmpq %rsi, %rdi
        jb 1f
        /* Source == destination is less common.  */
        je 2f
        leaq (%rsi,%rcx), %rdx
        cmpq %rdx, %rdi
        jb L(movsb_backward)
1:
        rep movsb
2:
        ret
L(movsb_backward):
        leaq -1(%rdi,%rcx), %rdi
        leaq -1(%rsi,%rcx), %rsi
        std
        rep movsb
        cld
        ret
END (__memmove_erms)
#  if defined SHARED
strong_alias (__memmove_erms, __memcpy_erms)
#  endif
# endif
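
/* The direction logic of __memmove_erms above, restated as a C sketch for
   clarity (not part of the build; memmove_erms_sketch is a made-up name).
   Forward "rep movsb" is safe unless the destination starts inside the
   source range, in which case the copy runs backward (std; rep movsb; cld):

     #include <stddef.h>

     static void memmove_erms_sketch (unsigned char *dst,
                                      const unsigned char *src, size_t n)
     {
       if (dst < src || dst >= src + n)
         for (size_t i = 0; i < n; i++)        // forward rep movsb
           dst[i] = src[i];
       else if (dst > src)
         for (size_t i = n; i > 0; i--)        // backward: std; rep movsb; cld
           dst[i - 1] = src[i - 1];
       // dst == src: nothing to copy.
     }
*/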

# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
        cmpq %rdx, %rcx
        jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))

ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
        movq %rdi, %rax
        addq %rdx, %rax
        jmp L(start_erms)
END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))

ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
        cmpq %rdx, %rcx
        jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
# endif

ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
        movq %rdi, %rax
L(start_erms):
        cmpq $VEC_SIZE, %rdx
        jb L(less_vec)
        cmpq $(VEC_SIZE * 2), %rdx
        ja L(movsb_more_2x_vec)
L(last_2x_vec):
        /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
        VMOVU (%rsi), %VEC(0)
        VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1)
        VMOVU %VEC(0), (%rdi)
        VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx)
L(return):
        VZEROUPPER
        ret

L(movsb):
        cmpq __x86_shared_non_temporal_threshold(%rip), %rdx
        jae L(more_8x_vec)
        cmpq %rsi, %rdi
        jb 1f
        /* Source == destination is less common.  */
        je L(nop)
        leaq (%rsi,%rdx), %r9
        cmpq %r9, %rdi
        /* Avoid slow backward REP MOVSB.  */
# if REP_MOVSB_THRESHOLD <= (VEC_SIZE * 8)
#  error Unsupported REP_MOVSB_THRESHOLD and VEC_SIZE!
# endif
        jb L(more_8x_vec_backward)
1:
        movq %rdx, %rcx
        rep movsb
L(nop):
        ret
#endif
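
/* The conditions checked in L(movsb) above, as a C sketch (illustration
   only; the names are hypothetical).  "rep movsb" is used only for
   mid-sized, forward-safe copies; everything else falls back to the
   vector paths.  The asm additionally special-cases dst == src as a
   plain return:

     #include <stddef.h>

     static int use_rep_movsb (const unsigned char *dst,
                               const unsigned char *src, size_t n,
                               size_t non_temporal_threshold)
     {
       if (n >= non_temporal_threshold)
         return 0;                  // take L(more_8x_vec) instead
       if (dst > src && dst < src + n)
         return 0;                  // backward overlap: L(more_8x_vec_backward)
       return 1;                    // no (or forward) overlap: rep movsb
     }
*/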

L(less_vec):
        /* Less than 1 VEC.  */
#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
# error Unsupported VEC_SIZE!
#endif
#if VEC_SIZE > 32
        cmpb $32, %dl
        jae L(between_32_63)
#endif
#if VEC_SIZE > 16
        cmpb $16, %dl
        jae L(between_16_31)
#endif
        cmpb $8, %dl
        jae L(between_8_15)
        cmpb $4, %dl
        jae L(between_4_7)
        cmpb $1, %dl
        ja L(between_2_3)
        jb 1f
        movzbl (%rsi), %ecx
        movb %cl, (%rdi)
1:
        ret
#if VEC_SIZE > 32
L(between_32_63):
        /* From 32 to 63.  No branch when size == 32.  */
        vmovdqu (%rsi), %ymm0
        vmovdqu -32(%rsi,%rdx), %ymm1
        vmovdqu %ymm0, (%rdi)
        vmovdqu %ymm1, -32(%rdi,%rdx)
        VZEROUPPER
        ret
#endif
#if VEC_SIZE > 16
        /* From 16 to 31.  No branch when size == 16.  */
L(between_16_31):
        vmovdqu (%rsi), %xmm0
        vmovdqu -16(%rsi,%rdx), %xmm1
        vmovdqu %xmm0, (%rdi)
        vmovdqu %xmm1, -16(%rdi,%rdx)
        ret
#endif
L(between_8_15):
        /* From 8 to 15.  No branch when size == 8.  */
        movq -8(%rsi,%rdx), %rcx
        movq (%rsi), %rsi
        movq %rcx, -8(%rdi,%rdx)
        movq %rsi, (%rdi)
        ret
L(between_4_7):
        /* From 4 to 7.  No branch when size == 4.  */
        movl -4(%rsi,%rdx), %ecx
        movl (%rsi), %esi
        movl %ecx, -4(%rdi,%rdx)
        movl %esi, (%rdi)
        ret
L(between_2_3):
        /* From 2 to 3.  No branch when size == 2.  */
        movzwl -2(%rsi,%rdx), %ecx
        movzwl (%rsi), %esi
        movw %cx, -2(%rdi,%rdx)
        movw %si, (%rdi)
        ret
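
/* The size ladder above, shown as a self-contained C sketch for the
   sub-16-byte classes (illustration only; copy_less_16 is a hypothetical
   name).  Each class uses two overlapping accesses of one fixed width, so
   there is no branch on the exact size within a class, and both loads
   happen before either store so overlap is handled:

     #include <string.h>
     #include <stdint.h>

     static void copy_less_16 (unsigned char *dst, const unsigned char *src,
                               size_t n)
     {
       if (n >= 8)
         {
           uint64_t head, tail;
           memcpy (&head, src, 8);
           memcpy (&tail, src + n - 8, 8);
           memcpy (dst, &head, 8);
           memcpy (dst + n - 8, &tail, 8);
         }
       else if (n >= 4)
         {
           uint32_t head, tail;
           memcpy (&head, src, 4);
           memcpy (&tail, src + n - 4, 4);
           memcpy (dst, &head, 4);
           memcpy (dst + n - 4, &tail, 4);
         }
       else if (n >= 2)
         {
           uint16_t head, tail;
           memcpy (&head, src, 2);
           memcpy (&tail, src + n - 2, 2);
           memcpy (dst, &head, 2);
           memcpy (dst + n - 2, &tail, 2);
         }
       else if (n == 1)
         *dst = *src;                    // n == 0 copies nothing
     }
*/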

#if defined USE_MULTIARCH && IS_IN (libc)
L(movsb_more_2x_vec):
        cmpq $REP_MOVSB_THRESHOLD, %rdx
        ja L(movsb)
#endif
L(more_2x_vec):
        /* More than 2 * VEC and there may be overlap between destination
           and source.  */
        cmpq $(VEC_SIZE * 8), %rdx
        ja L(more_8x_vec)
        cmpq $(VEC_SIZE * 4), %rdx
        jb L(last_4x_vec)
        /* Copy from 4 * VEC to 8 * VEC, inclusively.  */
        VMOVU (%rsi), %VEC(0)
        VMOVU VEC_SIZE(%rsi), %VEC(1)
        VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
        VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
        VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(4)
        VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
        VMOVU -(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
        VMOVU -(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
        VMOVU %VEC(0), (%rdi)
        VMOVU %VEC(1), VEC_SIZE(%rdi)
        VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi)
        VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi)
        VMOVU %VEC(4), -VEC_SIZE(%rdi,%rdx)
        VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
        VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
        VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
        VZEROUPPER
        ret
L(last_4x_vec):
        /* Copy from 2 * VEC to 4 * VEC.  */
        VMOVU (%rsi), %VEC(0)
        VMOVU VEC_SIZE(%rsi), %VEC(1)
        VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(2)
        VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
        VMOVU %VEC(0), (%rdi)
        VMOVU %VEC(1), VEC_SIZE(%rdi)
        VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx)
        VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
        VZEROUPPER
        ret

L(more_8x_vec):
        cmpq %rsi, %rdi
        ja L(more_8x_vec_backward)
        /* Source == destination is less common.  */
        je L(nop)
        /* Load the first VEC and last 4 * VEC to support overlapping
           addresses.  */
        VMOVU (%rsi), %VEC(4)
        VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(5)
        VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
        VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
        VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
        /* Save start and stop of the destination buffer.  */
        movq %rdi, %r11
        leaq -VEC_SIZE(%rdi, %rdx), %rcx
        /* Align destination for aligned stores in the loop.  Compute
           how much destination is misaligned.  */
        movq %rdi, %r8
        andq $(VEC_SIZE - 1), %r8
        /* Get the negative of offset for alignment.  */
        subq $VEC_SIZE, %r8
        /* Adjust source.  */
        subq %r8, %rsi
        /* Adjust destination which should be aligned now.  */
        subq %r8, %rdi
        /* Adjust length.  */
        addq %r8, %rdx
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
        /* Check non-temporal store threshold.  */
        cmpq __x86_shared_non_temporal_threshold(%rip), %rdx
        ja L(large_forward)
#endif
L(loop_4x_vec_forward):
        /* Copy 4 * VEC a time forward.  */
        VMOVU (%rsi), %VEC(0)
        VMOVU VEC_SIZE(%rsi), %VEC(1)
        VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
        VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
        addq $(VEC_SIZE * 4), %rsi
        subq $(VEC_SIZE * 4), %rdx
        VMOVA %VEC(0), (%rdi)
        VMOVA %VEC(1), VEC_SIZE(%rdi)
        VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi)
        VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi)
        addq $(VEC_SIZE * 4), %rdi
        cmpq $(VEC_SIZE * 4), %rdx
        ja L(loop_4x_vec_forward)
        /* Store the last 4 * VEC.  */
        VMOVU %VEC(5), (%rcx)
        VMOVU %VEC(6), -VEC_SIZE(%rcx)
        VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx)
        VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx)
        /* Store the first VEC.  */
        VMOVU %VEC(4), (%r11)
        VZEROUPPER
        ret
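
/* Worked example of the alignment fix-up in L(more_8x_vec) above
   (illustration only), assuming VEC_SIZE == 32 and an original %rdi of
   0x1005:

     r8  = 0x1005 & 31  =  5      // destination misalignment
     r8  = 5 - 32       = -27
     rsi = rsi - (-27)  = rsi + 27
     rdi = 0x1005 + 27  = 0x1020  // now 32-byte aligned
     rdx = rdx - 27               // 27 head bytes no longer in the loop

   Skipping the head bytes is safe because the first VEC was loaded into
   %VEC(4) before the adjustment and is stored to the original destination
   (%r11) after the loop.  */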

L(more_8x_vec_backward):
        /* Load the first 4 * VEC and last VEC to support overlapping
           addresses.  */
        VMOVU (%rsi), %VEC(4)
        VMOVU VEC_SIZE(%rsi), %VEC(5)
        VMOVU (VEC_SIZE * 2)(%rsi), %VEC(6)
        VMOVU (VEC_SIZE * 3)(%rsi), %VEC(7)
        VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(8)
        /* Save stop of the destination buffer.  */
        leaq -VEC_SIZE(%rdi, %rdx), %r11
        /* Align destination end for aligned stores in the loop.  Compute
           how much destination end is misaligned.  */
        leaq -VEC_SIZE(%rsi, %rdx), %rcx
        movq %r11, %r9
        movq %r11, %r8
        andq $(VEC_SIZE - 1), %r8
        /* Adjust source.  */
        subq %r8, %rcx
        /* Adjust the end of destination which should be aligned now.  */
        subq %r8, %r9
        /* Adjust length.  */
        subq %r8, %rdx
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
        /* Check non-temporal store threshold.  */
        cmpq __x86_shared_non_temporal_threshold(%rip), %rdx
        ja L(large_backward)
#endif
L(loop_4x_vec_backward):
        /* Copy 4 * VEC a time backward.  */
        VMOVU (%rcx), %VEC(0)
        VMOVU -VEC_SIZE(%rcx), %VEC(1)
        VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2)
        VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3)
        subq $(VEC_SIZE * 4), %rcx
        subq $(VEC_SIZE * 4), %rdx
        VMOVA %VEC(0), (%r9)
        VMOVA %VEC(1), -VEC_SIZE(%r9)
        VMOVA %VEC(2), -(VEC_SIZE * 2)(%r9)
        VMOVA %VEC(3), -(VEC_SIZE * 3)(%r9)
        subq $(VEC_SIZE * 4), %r9
        cmpq $(VEC_SIZE * 4), %rdx
        ja L(loop_4x_vec_backward)
        /* Store the first 4 * VEC.  */
        VMOVU %VEC(4), (%rdi)
        VMOVU %VEC(5), VEC_SIZE(%rdi)
        VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi)
        VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi)
        /* Store the last VEC.  */
        VMOVU %VEC(8), (%r11)
        VZEROUPPER
        ret

#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
L(large_forward):
        /* Don't use non-temporal store if there is overlap between
           destination and source since destination may be in cache
           when source is loaded.  */
        leaq (%rdi, %rdx), %r10
        cmpq %r10, %rsi
        jb L(loop_4x_vec_forward)
L(loop_large_forward):
        /* Copy 4 * VEC a time forward with non-temporal stores.  */
        PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
        PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
        VMOVU (%rsi), %VEC(0)
        VMOVU VEC_SIZE(%rsi), %VEC(1)
        VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
        VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
        addq $PREFETCHED_LOAD_SIZE, %rsi
        subq $PREFETCHED_LOAD_SIZE, %rdx
        VMOVNT %VEC(0), (%rdi)
        VMOVNT %VEC(1), VEC_SIZE(%rdi)
        VMOVNT %VEC(2), (VEC_SIZE * 2)(%rdi)
        VMOVNT %VEC(3), (VEC_SIZE * 3)(%rdi)
        addq $PREFETCHED_LOAD_SIZE, %rdi
        cmpq $PREFETCHED_LOAD_SIZE, %rdx
        ja L(loop_large_forward)
        sfence
        /* Store the last 4 * VEC.  */
        VMOVU %VEC(5), (%rcx)
        VMOVU %VEC(6), -VEC_SIZE(%rcx)
        VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx)
        VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx)
        /* Store the first VEC.  */
        VMOVU %VEC(4), (%r11)
        VZEROUPPER
        ret
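
/* A minimal C sketch of one non-temporal 4 * VEC step, assuming
   VEC_SIZE == 32 (AVX2) and a 32-byte-aligned dst; illustration only, the
   function name is hypothetical.  _mm256_stream_si256 corresponds to
   VMOVNT and _mm_sfence to the sfence that ends the streaming loop:

     #include <immintrin.h>

     static void stream_4x_vec (unsigned char *dst, const unsigned char *src)
     {
       __m256i v0 = _mm256_loadu_si256 ((const __m256i *) (src + 0));
       __m256i v1 = _mm256_loadu_si256 ((const __m256i *) (src + 32));
       __m256i v2 = _mm256_loadu_si256 ((const __m256i *) (src + 64));
       __m256i v3 = _mm256_loadu_si256 ((const __m256i *) (src + 96));
       _mm256_stream_si256 ((__m256i *) (dst + 0), v0);   // bypass the cache
       _mm256_stream_si256 ((__m256i *) (dst + 32), v1);
       _mm256_stream_si256 ((__m256i *) (dst + 64), v2);
       _mm256_stream_si256 ((__m256i *) (dst + 96), v3);
     }

   After the whole copy, _mm_sfence () orders the streaming stores before
   the function returns.  */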

L(large_backward):
        /* Don't use non-temporal store if there is overlap between
           destination and source since destination may be in cache
           when source is loaded.  */
        leaq (%rcx, %rdx), %r10
        cmpq %r10, %r9
        jb L(loop_4x_vec_backward)
L(loop_large_backward):
        /* Copy 4 * VEC a time backward with non-temporal stores.  */
        PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
        PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
        VMOVU (%rcx), %VEC(0)
        VMOVU -VEC_SIZE(%rcx), %VEC(1)
        VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2)
        VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3)
        subq $PREFETCHED_LOAD_SIZE, %rcx
        subq $PREFETCHED_LOAD_SIZE, %rdx
        VMOVNT %VEC(0), (%r9)
        VMOVNT %VEC(1), -VEC_SIZE(%r9)
        VMOVNT %VEC(2), -(VEC_SIZE * 2)(%r9)
        VMOVNT %VEC(3), -(VEC_SIZE * 3)(%r9)
        subq $PREFETCHED_LOAD_SIZE, %r9
        cmpq $PREFETCHED_LOAD_SIZE, %rdx
        ja L(loop_large_backward)
        sfence
        /* Store the first 4 * VEC.  */
        VMOVU %VEC(4), (%rdi)
        VMOVU %VEC(5), VEC_SIZE(%rdi)
        VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi)
        VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi)
        /* Store the last VEC.  */
        VMOVU %VEC(8), (%r11)
        VZEROUPPER
        ret
#endif
END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))

#ifdef SHARED
# if IS_IN (libc)
#  ifdef USE_MULTIARCH
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms),
              MEMMOVE_SYMBOL (__memcpy, unaligned_erms))
strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms),
              MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms))
#  endif
strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned),
              MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned))
# endif
#endif
#if VEC_SIZE == 16 || defined SHARED
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned),
              MEMCPY_SYMBOL (__memcpy, unaligned))
#endif