/* memmove/memcpy/mempcpy with unaligned load/store and rep movsb
   Copyright (C) 2016-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

/* memmove/memcpy/mempcpy is implemented as:
   1. Use overlapping loads and stores to avoid branches.
   2. Load all sources into registers and store them together to avoid
      a possible address overlap between source and destination.
   3. If size is 8 * VEC_SIZE or less, load all sources into registers
      and store them together.
   4. If the destination address is above the source address, copy
      backward 4 * VEC_SIZE at a time with unaligned loads and aligned
      stores.  Load the first 4 * VEC and the last VEC before the loop
      and store them after the loop to support overlapping addresses.
   5. Otherwise, copy forward 4 * VEC_SIZE at a time with unaligned
      loads and aligned stores.  Load the last 4 * VEC and the first
      VEC before the loop and store them after the loop to support
      overlapping addresses.
   6. If size >= __x86_shared_non_temporal_threshold and there is no
      overlap between destination and source, use non-temporal stores
      instead of aligned stores.  */
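
/* Illustrative sketch (not part of the original source): the head/tail
   trick from steps 1-3 written as C-like pseudocode.  vec_t,
   load_unaligned and store_unaligned are made-up names for a
   VEC_SIZE-wide type and its unaligned load/store.  Both loads are
   issued before either store, so the copy is correct even when the
   source and destination ranges overlap:

     vec_t head = load_unaligned (src);
     vec_t tail = load_unaligned (src + len - VEC_SIZE);
     store_unaligned (dst, head);
     store_unaligned (dst + len - VEC_SIZE, tail);

   This is valid for VEC_SIZE <= len <= 2 * VEC_SIZE.  */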

#include <sysdep.h>

#ifndef MEMCPY_SYMBOL
# define MEMCPY_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMPCPY_SYMBOL
# define MEMPCPY_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMMOVE_CHK_SYMBOL
# define MEMMOVE_CHK_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
#endif

#ifndef VZEROUPPER
# if VEC_SIZE > 16
#  define VZEROUPPER vzeroupper
# else
#  define VZEROUPPER
# endif
#endif

/* Threshold to use Enhanced REP MOVSB.  Since there is overhead to set
   up a REP MOVSB operation, REP MOVSB isn't faster on short data.  The
   memcpy micro benchmark in glibc shows that 2KB is the approximate
   value above which REP MOVSB becomes faster than the SSE2 optimization
   on processors with Enhanced REP MOVSB.  Since a larger register size
   can move more data with a single load and store, the threshold is
   higher with a larger register size.  */
#ifndef REP_MOVSB_THRESHOLD
# define REP_MOVSB_THRESHOLD	(2048 * (VEC_SIZE / 16))
#endif
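
/* With the default formula this works out to 2 KB when VEC_SIZE is 16
   (SSE2), 4 KB when it is 32 (AVX) and 8 KB when it is 64 (AVX-512).  */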

/* Avoid short-distance REP MOVSB only with non-SSE vectors.  */
#ifndef AVOID_SHORT_DISTANCE_REP_MOVSB
# define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16)
#else
# define AVOID_SHORT_DISTANCE_REP_MOVSB 0
#endif

#ifndef PREFETCH
# define PREFETCH(addr) prefetcht0 addr
#endif

/* Assume 64-byte prefetch size.  */
#ifndef PREFETCH_SIZE
# define PREFETCH_SIZE 64
#endif

#define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4)

#if PREFETCH_SIZE == 64
# if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base)
# elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE)base)
# elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base)
# else
#  error Unsupported PREFETCHED_LOAD_SIZE!
# endif
#else
# error Unsupported PREFETCH_SIZE!
#endif
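
/* Example expansion (not part of the original source): with VEC_SIZE == 32,
   PREFETCHED_LOAD_SIZE is 128 == 2 * PREFETCH_SIZE, so
   PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2) becomes a
   prefetcht0 of 256(%rsi) and 320(%rsi), i.e. the two cache lines the
   copy loop will reach two iterations from now.  */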

#ifndef SECTION
# error SECTION is not defined!
#endif

	.section SECTION(.text),"ax",@progbits
#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
#endif

ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned))
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP
	jmp	L(start)
END (MEMPCPY_SYMBOL (__mempcpy, unaligned))

#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
#endif

ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
	movq	%rdi, %rax
L(start):
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)
	cmp	$(VEC_SIZE * 2), %RDX_LP
	ja	L(more_2x_vec)
#if !defined USE_MULTIARCH || !IS_IN (libc)
L(last_2x_vec):
#endif
	/* From VEC_SIZE to 2 * VEC_SIZE bytes, inclusive.  No branch when
	   size == VEC_SIZE.  */
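	/* Example: with VEC_SIZE == 32 and size == 40, VEC(0) covers source
	   bytes [0, 32) and VEC(1) covers [8, 40); both loads happen before
	   either store, so the 24-byte overlap is harmless.  */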
	VMOVU	(%rsi), %VEC(0)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
	VZEROUPPER
#if !defined USE_MULTIARCH || !IS_IN (libc)
L(nop):
#endif
	ret
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMMOVE_SYMBOL (__memmove, unaligned))

# if VEC_SIZE == 16
ENTRY (__mempcpy_chk_erms)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__mempcpy_chk_erms)

/* Only used to measure performance of REP MOVSB.  */
ENTRY (__mempcpy_erms)
	mov	%RDI_LP, %RAX_LP
	/* Skip zero length.  */
	test	%RDX_LP, %RDX_LP
	jz	2f
	add	%RDX_LP, %RAX_LP
	jmp	L(start_movsb)
END (__mempcpy_erms)

ENTRY (__memmove_chk_erms)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__memmove_chk_erms)

ENTRY (__memmove_erms)
	movq	%rdi, %rax
	/* Skip zero length.  */
	test	%RDX_LP, %RDX_LP
	jz	2f
L(start_movsb):
	mov	%RDX_LP, %RCX_LP
	cmp	%RSI_LP, %RDI_LP
	jb	1f
	/* Source == destination is less common.  */
	je	2f
	lea	(%rsi,%rcx), %RDX_LP
	cmp	%RDX_LP, %RDI_LP
	jb	L(movsb_backward)
1:
	rep movsb
2:
	ret
L(movsb_backward):
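	/* Copy downward: point RSI/RDI at the last byte, set the direction
	   flag so REP MOVSB decrements, then clear it again afterwards (the
	   ABI requires DF to be clear on return).  */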
	leaq	-1(%rdi,%rcx), %rdi
	leaq	-1(%rsi,%rcx), %rsi
	std
	rep movsb
	cld
	ret
END (__memmove_erms)
strong_alias (__memmove_erms, __memcpy_erms)
strong_alias (__memmove_chk_erms, __memcpy_chk_erms)
# endif

# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
# endif

ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP
	jmp	L(start_erms)
END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))

# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
# endif

ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
	movq	%rdi, %rax
L(start_erms):
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)
	cmp	$(VEC_SIZE * 2), %RDX_LP
	ja	L(movsb_more_2x_vec)
L(last_2x_vec):
	/* From VEC_SIZE to 2 * VEC_SIZE bytes, inclusive.  No branch when
	   size == VEC_SIZE.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
L(return):
	VZEROUPPER
	ret

L(movsb):
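	/* Sizes at or above the non-temporal threshold skip REP MOVSB and
	   take the 8x VEC (possibly non-temporal) path instead.  */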
	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
	jae	L(more_8x_vec)
	cmpq	%rsi, %rdi
	jb	1f
	/* Source == destination is less common.  */
	je	L(nop)
	leaq	(%rsi,%rdx), %r9
	cmpq	%r9, %rdi
	/* Avoid slow backward REP MOVSB.  */
# if REP_MOVSB_THRESHOLD <= (VEC_SIZE * 8)
#  error Unsupported REP_MOVSB_THRESHOLD and VEC_SIZE!
# endif
	jb	L(more_8x_vec_backward)
# if AVOID_SHORT_DISTANCE_REP_MOVSB
	movq	%rdi, %rcx
	subq	%rsi, %rcx
	jmp	2f
# endif
1:
# if AVOID_SHORT_DISTANCE_REP_MOVSB
	movq	%rsi, %rcx
	subq	%rdi, %rcx
2:
	/* Avoid "rep movsb" if RCX, the distance between source and
	   destination, is N*4GB + [1..63] with N >= 0.  */
	cmpl	$63, %ecx
	jbe	L(more_2x_vec)	/* Avoid "rep movsb" if ECX <= 63.  */
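	/* Only %ecx, the low 32 bits of the distance, is compared above,
	   which is why the condition is stated modulo 4GB.  */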
# endif
	mov	%RDX_LP, %RCX_LP
	rep movsb
L(nop):
	ret
#endif

L(less_vec):
	/* Less than 1 VEC.  */
#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
# error Unsupported VEC_SIZE!
#endif
#if VEC_SIZE > 32
	cmpb	$32, %dl
	jae	L(between_32_63)
#endif
#if VEC_SIZE > 16
	cmpb	$16, %dl
	jae	L(between_16_31)
#endif
	cmpb	$8, %dl
	jae	L(between_8_15)
	cmpb	$4, %dl
	jae	L(between_4_7)
	cmpb	$1, %dl
	ja	L(between_2_3)
	jb	1f
	movzbl	(%rsi), %ecx
	movb	%cl, (%rdi)
1:
	ret
#if VEC_SIZE > 32
L(between_32_63):
	/* From 32 to 63.  No branch when size == 32.  */
	vmovdqu	(%rsi), %ymm0
	vmovdqu	-32(%rsi,%rdx), %ymm1
	vmovdqu	%ymm0, (%rdi)
	vmovdqu	%ymm1, -32(%rdi,%rdx)
	VZEROUPPER
	ret
#endif
#if VEC_SIZE > 16
	/* From 16 to 31.  No branch when size == 16.  */
L(between_16_31):
	vmovdqu	(%rsi), %xmm0
	vmovdqu	-16(%rsi,%rdx), %xmm1
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm1, -16(%rdi,%rdx)
	ret
#endif
L(between_8_15):
	/* From 8 to 15.  No branch when size == 8.  */
	movq	-8(%rsi,%rdx), %rcx
	movq	(%rsi), %rsi
	movq	%rcx, -8(%rdi,%rdx)
	movq	%rsi, (%rdi)
	ret
L(between_4_7):
	/* From 4 to 7.  No branch when size == 4.  */
	movl	-4(%rsi,%rdx), %ecx
	movl	(%rsi), %esi
	movl	%ecx, -4(%rdi,%rdx)
	movl	%esi, (%rdi)
	ret
L(between_2_3):
	/* From 2 to 3.  No branch when size == 2.  */
	movzwl	-2(%rsi,%rdx), %ecx
	movzwl	(%rsi), %esi
	movw	%cx, -2(%rdi,%rdx)
	movw	%si, (%rdi)
	ret

#if defined USE_MULTIARCH && IS_IN (libc)
L(movsb_more_2x_vec):
	cmpq	$REP_MOVSB_THRESHOLD, %rdx
	ja	L(movsb)
#endif
L(more_2x_vec):
	/* More than 2 * VEC and there may be overlap between destination
	   and source.  */
	cmpq	$(VEC_SIZE * 8), %rdx
	ja	L(more_8x_vec)
	cmpq	$(VEC_SIZE * 4), %rdx
	jb	L(last_4x_vec)
	/* Copy from 4 * VEC to 8 * VEC, inclusive.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(4)
	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
	VMOVU	-(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
	VMOVU	-(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), VEC_SIZE(%rdi)
	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
	VMOVU	%VEC(4), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
	VZEROUPPER
	ret
L(last_4x_vec):
	/* Copy from 2 * VEC to 4 * VEC.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(2)
	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), VEC_SIZE(%rdi)
	VMOVU	%VEC(2), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
	VZEROUPPER
	ret

L(more_8x_vec):
	cmpq	%rsi, %rdi
	ja	L(more_8x_vec_backward)
	/* Source == destination is less common.  */
	je	L(nop)
	/* Load the first VEC and last 4 * VEC to support overlapping
	   addresses.  */
	VMOVU	(%rsi), %VEC(4)
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(5)
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
	/* Save the start and end of the destination buffer.  */
	movq	%rdi, %r11
	leaq	-VEC_SIZE(%rdi, %rdx), %rcx
	/* Align the destination for aligned stores in the loop.  Compute
	   how much the destination is misaligned.  */
	movq	%rdi, %r8
	andq	$(VEC_SIZE - 1), %r8
	/* Get the negative of the alignment offset.  */
	subq	$VEC_SIZE, %r8
	/* Adjust the source.  */
	subq	%r8, %rsi
	/* Adjust the destination, which should now be aligned.  */
	subq	%r8, %rdi
	/* Adjust the length.  */
	addq	%r8, %rdx
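	/* Worked example (VEC_SIZE == 32): if RDI ends in 0x...05, R8 becomes
	   5 - 32 = -27, so RSI and RDI advance by 27 bytes, RDX shrinks by 27
	   and RDI is now 32-byte aligned.  The skipped head (at most VEC_SIZE
	   bytes) is covered by VEC(4), stored through %r11 after the loop.  */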
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
	/* Check non-temporal store threshold.  */
	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
	ja	L(large_forward)
#endif
L(loop_4x_vec_forward):
	/* Copy 4 * VEC at a time forward.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	addq	$(VEC_SIZE * 4), %rsi
	subq	$(VEC_SIZE * 4), %rdx
	VMOVA	%VEC(0), (%rdi)
	VMOVA	%VEC(1), VEC_SIZE(%rdi)
	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
	addq	$(VEC_SIZE * 4), %rdi
	cmpq	$(VEC_SIZE * 4), %rdx
	ja	L(loop_4x_vec_forward)
	/* Store the last 4 * VEC.  */
	VMOVU	%VEC(5), (%rcx)
	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
	/* Store the first VEC.  */
	VMOVU	%VEC(4), (%r11)
	VZEROUPPER
	ret

L(more_8x_vec_backward):
	/* Load the first 4 * VEC and the last VEC to support overlapping
	   addresses.  */
	VMOVU	(%rsi), %VEC(4)
	VMOVU	VEC_SIZE(%rsi), %VEC(5)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(6)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(7)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(8)
	/* Save the end of the destination buffer.  */
	leaq	-VEC_SIZE(%rdi, %rdx), %r11
	/* Align the destination end for aligned stores in the loop.
	   Compute how much the destination end is misaligned.  */
	leaq	-VEC_SIZE(%rsi, %rdx), %rcx
	movq	%r11, %r9
	movq	%r11, %r8
	andq	$(VEC_SIZE - 1), %r8
	/* Adjust the source.  */
	subq	%r8, %rcx
	/* Adjust the end of the destination, which should now be aligned.  */
	subq	%r8, %r9
	/* Adjust the length.  */
	subq	%r8, %rdx
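	/* Mirror of the forward case: R8 is the misalignment of the last
	   destination VEC, so RCX (source end) and R9 (destination end) move
	   down by R8 and RDX shrinks by R8.  The skipped tail is covered by
	   VEC(8), stored through %r11 after the loop.  */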
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
	/* Check non-temporal store threshold.  */
	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
	ja	L(large_backward)
#endif
L(loop_4x_vec_backward):
	/* Copy 4 * VEC at a time backward.  */
	VMOVU	(%rcx), %VEC(0)
	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
	subq	$(VEC_SIZE * 4), %rcx
	subq	$(VEC_SIZE * 4), %rdx
	VMOVA	%VEC(0), (%r9)
	VMOVA	%VEC(1), -VEC_SIZE(%r9)
	VMOVA	%VEC(2), -(VEC_SIZE * 2)(%r9)
	VMOVA	%VEC(3), -(VEC_SIZE * 3)(%r9)
	subq	$(VEC_SIZE * 4), %r9
	cmpq	$(VEC_SIZE * 4), %rdx
	ja	L(loop_4x_vec_backward)
	/* Store the first 4 * VEC.  */
	VMOVU	%VEC(4), (%rdi)
	VMOVU	%VEC(5), VEC_SIZE(%rdi)
	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
	/* Store the last VEC.  */
	VMOVU	%VEC(8), (%r11)
	VZEROUPPER
	ret

#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
L(large_forward):
	/* Don't use non-temporal stores if the destination and source
	   overlap, since the destination may be in the cache when the
	   source is loaded.  */
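	/* The forward path guarantees RSI > RDI, so the remaining source
	   overlaps the destination exactly when RSI is below the destination
	   end in %r10; in that case fall back to the regular aligned-store
	   loop.  */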
	leaq	(%rdi, %rdx), %r10
	cmpq	%r10, %rsi
	jb	L(loop_4x_vec_forward)
L(loop_large_forward):
	/* Copy 4 * VEC at a time forward with non-temporal stores.  */
	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	addq	$PREFETCHED_LOAD_SIZE, %rsi
	subq	$PREFETCHED_LOAD_SIZE, %rdx
	VMOVNT	%VEC(0), (%rdi)
	VMOVNT	%VEC(1), VEC_SIZE(%rdi)
	VMOVNT	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVNT	%VEC(3), (VEC_SIZE * 3)(%rdi)
	addq	$PREFETCHED_LOAD_SIZE, %rdi
	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
	ja	L(loop_large_forward)
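	/* Non-temporal stores are weakly ordered; SFENCE makes them globally
	   visible before the ordinary stores below.  */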
	sfence
	/* Store the last 4 * VEC.  */
	VMOVU	%VEC(5), (%rcx)
	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
	/* Store the first VEC.  */
	VMOVU	%VEC(4), (%r11)
	VZEROUPPER
	ret

L(large_backward):
	/* Don't use non-temporal stores if the destination and source
	   overlap, since the destination may be in the cache when the
	   source is loaded.  */
	leaq	(%rcx, %rdx), %r10
	cmpq	%r10, %r9
	jb	L(loop_4x_vec_backward)
L(loop_large_backward):
	/* Copy 4 * VEC at a time backward with non-temporal stores.  */
	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
	VMOVU	(%rcx), %VEC(0)
	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
	subq	$PREFETCHED_LOAD_SIZE, %rcx
	subq	$PREFETCHED_LOAD_SIZE, %rdx
	VMOVNT	%VEC(0), (%r9)
	VMOVNT	%VEC(1), -VEC_SIZE(%r9)
	VMOVNT	%VEC(2), -(VEC_SIZE * 2)(%r9)
	VMOVNT	%VEC(3), -(VEC_SIZE * 3)(%r9)
	subq	$PREFETCHED_LOAD_SIZE, %r9
	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
	ja	L(loop_large_backward)
	sfence
	/* Store the first 4 * VEC.  */
	VMOVU	%VEC(4), (%rdi)
	VMOVU	%VEC(5), VEC_SIZE(%rdi)
	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
	/* Store the last VEC.  */
	VMOVU	%VEC(8), (%r11)
	VZEROUPPER
	ret
#endif
END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))

#if IS_IN (libc)
# ifdef USE_MULTIARCH
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms),
	      MEMMOVE_SYMBOL (__memcpy, unaligned_erms))
#  ifdef SHARED
strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms),
	      MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms))
#  endif
# endif
# ifdef SHARED
strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned),
	      MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned))
# endif
#endif
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned),
	      MEMCPY_SYMBOL (__memcpy, unaligned))