/* memmove/memcpy/mempcpy with unaligned load/store and rep movsb
   Copyright (C) 2016-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/* memmove/memcpy/mempcpy is implemented as:
   1. Use overlapping loads and stores to avoid branching on the
      exact size.
   2. Load all sources into registers and store them together to avoid
      possible address overlap between source and destination.
   3. If size is 8 * VEC_SIZE or less, load all sources into registers
      and store them together.
   4. If address of destination > address of source, backward copy
      4 * VEC_SIZE at a time with unaligned load and aligned store.
      Load the first 4 * VEC and last VEC before the loop and store
      them after the loop to support overlapping addresses.
   5. Otherwise, forward copy 4 * VEC_SIZE at a time with unaligned
      load and aligned store.  Load the last 4 * VEC and first VEC
      before the loop and store them after the loop to support
      overlapping addresses.
   6. If size >= __x86_shared_non_temporal_threshold and there is no
      overlap between destination and source, use non-temporal store
      instead of aligned store.
   An illustrative C sketch of technique 1 follows this comment.  */

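/* Illustrative only, not part of the build: a C sketch of technique 1
   above, assuming a 16-byte vector and SSE2 intrinsics.  For SIZE in
   [VEC_SIZE, 2 * VEC_SIZE], loading one vector from each end and
   storing one vector to each end covers every byte without branching
   on the exact size, and tolerates overlap between source and
   destination because both loads complete before either store:

	#include <emmintrin.h>
	#include <stddef.h>

	static void
	copy_16_to_32 (char *dst, const char *src, size_t size)
	{
	  __m128i head = _mm_loadu_si128 ((const __m128i *) src);
	  __m128i tail
	    = _mm_loadu_si128 ((const __m128i *) (src + size - 16));
	  _mm_storeu_si128 ((__m128i *) dst, head);
	  _mm_storeu_si128 ((__m128i *) (dst + size - 16), tail);
	}

   When SIZE == 16 the two vectors coincide; when SIZE == 32 they
   abut.  L(last_2x_vec) below is the assembly form of this idea.  */
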
#include <sysdep.h>

#ifndef MEMCPY_SYMBOL
# define MEMCPY_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMPCPY_SYMBOL
# define MEMPCPY_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMMOVE_CHK_SYMBOL
# define MEMMOVE_CHK_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
#endif

#ifndef VZEROUPPER
# if VEC_SIZE > 16
#  define VZEROUPPER vzeroupper
# else
#  define VZEROUPPER
# endif
#endif

#ifndef PREFETCH
# define PREFETCH(addr) prefetcht0 addr
#endif

/* Assume 64-byte prefetch size.  */
#ifndef PREFETCH_SIZE
# define PREFETCH_SIZE 64
#endif

#define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4)

#if PREFETCH_SIZE == 64
# if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base)
# elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE)base)
# elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base)
# else
#  error Unsupported PREFETCHED_LOAD_SIZE!
# endif
#else
# error Unsupported PREFETCH_SIZE!
#endif
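/* For illustration: with VEC_SIZE == 32, PREFETCHED_LOAD_SIZE is 128
   == 2 * PREFETCH_SIZE, so PREFETCH_ONE_SET (1, (%rsi), 256) expands
   to prefetches of the cache lines at offsets 256 and 320 from %rsi,
   i.e. one prefetch per 64-byte line of a 4 * VEC block, stepped in
   the direction DIR.  */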

#ifndef SECTION
# error SECTION is not defined!
#endif

	.section SECTION(.text),"ax",@progbits
#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
#endif

ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned))
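	/* mempcpy returns the end of the destination, DST + LEN.  */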
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP
	jmp	L(start)
END (MEMPCPY_SYMBOL (__mempcpy, unaligned))

#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
#endif

ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
	movq	%rdi, %rax
L(start):
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)
	cmp	$(VEC_SIZE * 2), %RDX_LP
	ja	L(more_2x_vec)
#if !defined USE_MULTIARCH || !IS_IN (libc)
L(last_2x_vec):
#endif
	/* From VEC_SIZE to 2 * VEC_SIZE bytes.  No branch when size
	   == VEC_SIZE.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
	VZEROUPPER
#if !defined USE_MULTIARCH || !IS_IN (libc)
L(nop):
#endif
	ret
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMMOVE_SYMBOL (__memmove, unaligned))

# if VEC_SIZE == 16
ENTRY (__mempcpy_chk_erms)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__mempcpy_chk_erms)

/* Only used to measure performance of REP MOVSB.  */
ENTRY (__mempcpy_erms)
	mov	%RDI_LP, %RAX_LP
	/* Skip zero length.  */
	test	%RDX_LP, %RDX_LP
	jz	2f
	add	%RDX_LP, %RAX_LP
	jmp	L(start_movsb)
END (__mempcpy_erms)

ENTRY (__memmove_chk_erms)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__memmove_chk_erms)

ENTRY (__memmove_erms)
	movq	%rdi, %rax
	/* Skip zero length.  */
	test	%RDX_LP, %RDX_LP
	jz	2f
L(start_movsb):
	mov	%RDX_LP, %RCX_LP
	cmp	%RSI_LP, %RDI_LP
	jb	1f
	/* Source == destination is less common.  */
	je	2f
	lea	(%rsi,%rcx), %RDX_LP
	cmp	%RDX_LP, %RDI_LP
	jb	L(movsb_backward)
1:
	rep movsb
2:
	ret
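	/* Backward REP MOVSB: with the direction flag set, MOVSB
	   decrements RSI/RDI, so both are pointed at the last byte of
	   their buffers.  Used when DST overlaps SRC from above.  */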
L(movsb_backward):
	leaq	-1(%rdi,%rcx), %rdi
	leaq	-1(%rsi,%rcx), %rsi
	std
	rep movsb
	cld
	ret
END (__memmove_erms)
strong_alias (__memmove_erms, __memcpy_erms)
strong_alias (__memmove_chk_erms, __memcpy_chk_erms)
# endif

# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
# endif

ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP
	jmp	L(start_erms)
END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))

# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
# endif

ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
	movq	%rdi, %rax
L(start_erms):
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)
	cmp	$(VEC_SIZE * 2), %RDX_LP
	ja	L(movsb_more_2x_vec)
L(last_2x_vec):
	/* From VEC_SIZE to 2 * VEC_SIZE bytes.  No branch when size
	   == VEC_SIZE.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
L(return):
	VZEROUPPER
	ret

L(movsb):
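	/* Reached only when SIZE > __x86_rep_movsb_threshold.  Above
	   the shared non-temporal threshold the 8x VEC loops (with
	   non-temporal stores) are preferred over REP MOVSB.  */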
	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
	jae	L(more_8x_vec)
	cmpq	%rsi, %rdi
	jb	1f
	/* Source == destination is less common.  */
	je	L(nop)
	leaq	(%rsi,%rdx), %r9
	cmpq	%r9, %rdi
	/* Avoid slow backward REP MOVSB.  */
	jb	L(more_8x_vec_backward)
1:
	mov	%RDX_LP, %RCX_LP
	rep movsb
L(nop):
	ret
#endif

L(less_vec):
	/* Less than 1 VEC.  */
#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
# error Unsupported VEC_SIZE!
#endif
#if VEC_SIZE > 32
	cmpb	$32, %dl
	jae	L(between_32_63)
#endif
#if VEC_SIZE > 16
	cmpb	$16, %dl
	jae	L(between_16_31)
#endif
	cmpb	$8, %dl
	jae	L(between_8_15)
	cmpb	$4, %dl
	jae	L(between_4_7)
	cmpb	$1, %dl
	ja	L(between_2_3)
	jb	1f
	movzbl	(%rsi), %ecx
	movb	%cl, (%rdi)
1:
	ret
#if VEC_SIZE > 32
L(between_32_63):
	/* From 32 to 63.  No branch when size == 32.  */
	vmovdqu	(%rsi), %ymm0
	vmovdqu	-32(%rsi,%rdx), %ymm1
	vmovdqu	%ymm0, (%rdi)
	vmovdqu	%ymm1, -32(%rdi,%rdx)
	VZEROUPPER
	ret
#endif
#if VEC_SIZE > 16
/* From 16 to 31.  No branch when size == 16.  */
L(between_16_31):
	vmovdqu	(%rsi), %xmm0
	vmovdqu	-16(%rsi,%rdx), %xmm1
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm1, -16(%rdi,%rdx)
	ret
#endif
L(between_8_15):
	/* From 8 to 15.  No branch when size == 8.  */
	movq	-8(%rsi,%rdx), %rcx
	movq	(%rsi), %rsi
	movq	%rcx, -8(%rdi,%rdx)
	movq	%rsi, (%rdi)
	ret
L(between_4_7):
	/* From 4 to 7.  No branch when size == 4.  */
	movl	-4(%rsi,%rdx), %ecx
	movl	(%rsi), %esi
	movl	%ecx, -4(%rdi,%rdx)
	movl	%esi, (%rdi)
	ret
L(between_2_3):
	/* From 2 to 3.  No branch when size == 2.  */
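	/* E.g. SIZE == 3: the two word loads below read bytes 0-1 and
	   1-2, and the two word stores rewrite byte 1, so all three
	   bytes are copied without a branch on the exact size.  */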
	movzwl	-2(%rsi,%rdx), %ecx
	movzwl	(%rsi), %esi
	movw	%cx, -2(%rdi,%rdx)
	movw	%si, (%rdi)
	ret

#if defined USE_MULTIARCH && IS_IN (libc)
L(movsb_more_2x_vec):
	cmp	__x86_rep_movsb_threshold(%rip), %RDX_LP
	ja	L(movsb)
#endif
L(more_2x_vec):
	/* More than 2 * VEC and there may be overlap between destination
	   and source.  */
	cmpq	$(VEC_SIZE * 8), %rdx
	ja	L(more_8x_vec)
	cmpq	$(VEC_SIZE * 4), %rdx
	jb	L(last_4x_vec)
	/* Copy from 4 * VEC to 8 * VEC, inclusive.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(4)
	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
	VMOVU	-(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
	VMOVU	-(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), VEC_SIZE(%rdi)
	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
	VMOVU	%VEC(4), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
	VZEROUPPER
	ret
L(last_4x_vec):
	/* Copy from 2 * VEC to 4 * VEC.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(2)
	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), VEC_SIZE(%rdi)
	VMOVU	%VEC(2), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
	VZEROUPPER
	ret

L(more_8x_vec):
	cmpq	%rsi, %rdi
	ja	L(more_8x_vec_backward)
	/* Source == destination is less common.  */
	je	L(nop)
	/* Load the first VEC and last 4 * VEC to support overlapping
	   addresses.  */
	VMOVU	(%rsi), %VEC(4)
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(5)
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
	/* Save start and stop of the destination buffer.  */
	movq	%rdi, %r11
	leaq	-VEC_SIZE(%rdi, %rdx), %rcx
	/* Align destination for aligned stores in the loop.  Compute
	   how much destination is misaligned.  */
	movq	%rdi, %r8
	andq	$(VEC_SIZE - 1), %r8
	/* Get the negative of offset for alignment.  */
	subq	$VEC_SIZE, %r8
	/* Adjust source.  */
	subq	%r8, %rsi
	/* Adjust destination which should be aligned now.  */
	subq	%r8, %rdi
	/* Adjust length.  */
	addq	%r8, %rdx
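	/* Worked example (illustration): with VEC_SIZE == 32 and
	   RDI % 32 == 5, R8 == 5 - 32 == -27, so RSI and RDI advance
	   by 27 bytes to the next 32-byte destination boundary and RDX
	   shrinks by 27.  The skipped head bytes are already in VEC(4)
	   and are stored after the loop.  */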
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
	/* Check non-temporal store threshold.  */
	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
	ja	L(large_forward)
#endif
L(loop_4x_vec_forward):
	/* Copy 4 * VEC at a time forward.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	addq	$(VEC_SIZE * 4), %rsi
	subq	$(VEC_SIZE * 4), %rdx
	VMOVA	%VEC(0), (%rdi)
	VMOVA	%VEC(1), VEC_SIZE(%rdi)
	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
	addq	$(VEC_SIZE * 4), %rdi
	cmpq	$(VEC_SIZE * 4), %rdx
	ja	L(loop_4x_vec_forward)
	/* Store the last 4 * VEC.  */
	VMOVU	%VEC(5), (%rcx)
	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
	/* Store the first VEC.  */
	VMOVU	%VEC(4), (%r11)
	VZEROUPPER
	ret

L(more_8x_vec_backward):
	/* Load the first 4 * VEC and last VEC to support overlapping
	   addresses.  */
	VMOVU	(%rsi), %VEC(4)
	VMOVU	VEC_SIZE(%rsi), %VEC(5)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(6)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(7)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(8)
	/* Save stop of the destination buffer.  */
	leaq	-VEC_SIZE(%rdi, %rdx), %r11
	/* Align destination end for aligned stores in the loop.  Compute
	   how much destination end is misaligned.  */
	leaq	-VEC_SIZE(%rsi, %rdx), %rcx
	movq	%r11, %r9
	movq	%r11, %r8
	andq	$(VEC_SIZE - 1), %r8
	/* Adjust source.  */
	subq	%r8, %rcx
	/* Adjust the end of destination which should be aligned now.  */
	subq	%r8, %r9
	/* Adjust length.  */
	subq	%r8, %rdx
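	/* Mirror image of the forward alignment above: R8 is the
	   misalignment of the last destination VEC address, so the
	   loop's aligned stores end at R9 == R11 - R8.  */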
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
	/* Check non-temporal store threshold.  */
	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
	ja	L(large_backward)
#endif
L(loop_4x_vec_backward):
	/* Copy 4 * VEC at a time backward.  */
	VMOVU	(%rcx), %VEC(0)
	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
	subq	$(VEC_SIZE * 4), %rcx
	subq	$(VEC_SIZE * 4), %rdx
	VMOVA	%VEC(0), (%r9)
	VMOVA	%VEC(1), -VEC_SIZE(%r9)
	VMOVA	%VEC(2), -(VEC_SIZE * 2)(%r9)
	VMOVA	%VEC(3), -(VEC_SIZE * 3)(%r9)
	subq	$(VEC_SIZE * 4), %r9
	cmpq	$(VEC_SIZE * 4), %rdx
	ja	L(loop_4x_vec_backward)
	/* Store the first 4 * VEC.  */
	VMOVU	%VEC(4), (%rdi)
	VMOVU	%VEC(5), VEC_SIZE(%rdi)
	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
	/* Store the last VEC.  */
	VMOVU	%VEC(8), (%r11)
	VZEROUPPER
	ret

#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
L(large_forward):
	/* Don't use non-temporal store if there is overlap between
	   destination and source since destination may be in cache
	   when source is loaded.  */
	leaq	(%rdi, %rdx), %r10
	cmpq	%r10, %rsi
	jb	L(loop_4x_vec_forward)
L(loop_large_forward):
	/* Copy 4 * VEC at a time forward with non-temporal stores.  */
	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	addq	$PREFETCHED_LOAD_SIZE, %rsi
	subq	$PREFETCHED_LOAD_SIZE, %rdx
	VMOVNT	%VEC(0), (%rdi)
	VMOVNT	%VEC(1), VEC_SIZE(%rdi)
	VMOVNT	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVNT	%VEC(3), (VEC_SIZE * 3)(%rdi)
	addq	$PREFETCHED_LOAD_SIZE, %rdi
	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
	ja	L(loop_large_forward)
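	/* Non-temporal stores are weakly ordered; fence them before
	   the ordinary stores of the saved head and tail below.  */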
	sfence
	/* Store the last 4 * VEC.  */
	VMOVU	%VEC(5), (%rcx)
	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
	/* Store the first VEC.  */
	VMOVU	%VEC(4), (%r11)
	VZEROUPPER
	ret

L(large_backward):
	/* Don't use non-temporal store if there is overlap between
	   destination and source since destination may be in cache
	   when source is loaded.  */
	leaq	(%rcx, %rdx), %r10
	cmpq	%r10, %r9
	jb	L(loop_4x_vec_backward)
L(loop_large_backward):
	/* Copy 4 * VEC at a time backward with non-temporal stores.  */
	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
	VMOVU	(%rcx), %VEC(0)
	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
	subq	$PREFETCHED_LOAD_SIZE, %rcx
	subq	$PREFETCHED_LOAD_SIZE, %rdx
	VMOVNT	%VEC(0), (%r9)
	VMOVNT	%VEC(1), -VEC_SIZE(%r9)
	VMOVNT	%VEC(2), -(VEC_SIZE * 2)(%r9)
	VMOVNT	%VEC(3), -(VEC_SIZE * 3)(%r9)
	subq	$PREFETCHED_LOAD_SIZE, %r9
	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
	ja	L(loop_large_backward)
	sfence
	/* Store the first 4 * VEC.  */
	VMOVU	%VEC(4), (%rdi)
	VMOVU	%VEC(5), VEC_SIZE(%rdi)
	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
	/* Store the last VEC.  */
	VMOVU	%VEC(8), (%r11)
	VZEROUPPER
	ret
#endif
END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))

#if IS_IN (libc)
# ifdef USE_MULTIARCH
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms),
	      MEMMOVE_SYMBOL (__memcpy, unaligned_erms))
#  ifdef SHARED
strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms),
	      MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms))
#  endif
# endif
# ifdef SHARED
strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned),
	      MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned))
# endif
#endif
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned),
	      MEMCPY_SYMBOL (__memcpy, unaligned))