/* memcpy with AVX
   Copyright (C) 2014-2016 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>

#if IS_IN (libc) \
    && (defined SHARED \
	|| defined USE_AS_MEMMOVE \
	|| !defined USE_MULTIARCH)

#include "asm-syntax.h"
#ifndef MEMCPY
# define MEMCPY		__memcpy_avx_unaligned
# define MEMCPY_CHK	__memcpy_chk_avx_unaligned
#endif

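/* memcpy/mempcpy/memmove/bcopy variant built on unaligned AVX loads and
   stores: %rdi is the destination, %rsi the source and %rdx the length.
   The value to return (the destination, or destination + length for
   mempcpy) is computed into %rax on entry.  The _chk entry point aborts
   via __chk_fail when the object size passed in %rcx is smaller than
   the length.  */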
	.section .text.avx,"ax",@progbits
#if !defined USE_AS_BCOPY
ENTRY (MEMCPY_CHK)
	cmpq	%rdx, %rcx
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMCPY_CHK)
#endif

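/* Dispatch on the length: 256 bytes and up take the block-copy paths at
   L(256bytesormore); anything smaller is handled by overlapping head
   and tail copies selected by the size checks below.  */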
ENTRY (MEMCPY)
	mov	%rdi, %rax
#ifdef USE_AS_MEMPCPY
	add	%rdx, %rax
#endif
	cmp	$256, %rdx
	jae	L(256bytesormore)
	cmp	$16, %dl
	jb	L(less_16bytes)
	cmp	$128, %dl
	jb	L(less_128bytes)
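/* 128 to 255 bytes: load eight 16-byte chunks from the start and eight
   from the end of the source, then store all of them; the two halves
   overlap in the middle, so the whole range is covered.  */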
	vmovdqu	(%rsi), %xmm0
	lea	(%rsi, %rdx), %rcx
	vmovdqu	0x10(%rsi), %xmm1
	vmovdqu	0x20(%rsi), %xmm2
	vmovdqu	0x30(%rsi), %xmm3
	vmovdqu	0x40(%rsi), %xmm4
	vmovdqu	0x50(%rsi), %xmm5
	vmovdqu	0x60(%rsi), %xmm6
	vmovdqu	0x70(%rsi), %xmm7
	vmovdqu	-0x80(%rcx), %xmm8
	vmovdqu	-0x70(%rcx), %xmm9
	vmovdqu	-0x60(%rcx), %xmm10
	vmovdqu	-0x50(%rcx), %xmm11
	vmovdqu	-0x40(%rcx), %xmm12
	vmovdqu	-0x30(%rcx), %xmm13
	vmovdqu	-0x20(%rcx), %xmm14
	vmovdqu	-0x10(%rcx), %xmm15
	lea	(%rdi, %rdx), %rdx
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm1, 0x10(%rdi)
	vmovdqu	%xmm2, 0x20(%rdi)
	vmovdqu	%xmm3, 0x30(%rdi)
	vmovdqu	%xmm4, 0x40(%rdi)
	vmovdqu	%xmm5, 0x50(%rdi)
	vmovdqu	%xmm6, 0x60(%rdi)
	vmovdqu	%xmm7, 0x70(%rdi)
	vmovdqu	%xmm8, -0x80(%rdx)
	vmovdqu	%xmm9, -0x70(%rdx)
	vmovdqu	%xmm10, -0x60(%rdx)
	vmovdqu	%xmm11, -0x50(%rdx)
	vmovdqu	%xmm12, -0x40(%rdx)
	vmovdqu	%xmm13, -0x30(%rdx)
	vmovdqu	%xmm14, -0x20(%rdx)
	vmovdqu	%xmm15, -0x10(%rdx)
	ret
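/* 64 to 127 bytes: four 16-byte chunks from each end.  */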
	.p2align 4
L(less_128bytes):
	cmp	$64, %dl
	jb	L(less_64bytes)
	vmovdqu	(%rsi), %xmm0
	lea	(%rsi, %rdx), %rcx
	vmovdqu	0x10(%rsi), %xmm1
	vmovdqu	0x20(%rsi), %xmm2
	lea	(%rdi, %rdx), %rdx
	vmovdqu	0x30(%rsi), %xmm3
	vmovdqu	-0x40(%rcx), %xmm4
	vmovdqu	-0x30(%rcx), %xmm5
	vmovdqu	-0x20(%rcx), %xmm6
	vmovdqu	-0x10(%rcx), %xmm7
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm1, 0x10(%rdi)
	vmovdqu	%xmm2, 0x20(%rdi)
	vmovdqu	%xmm3, 0x30(%rdi)
	vmovdqu	%xmm4, -0x40(%rdx)
	vmovdqu	%xmm5, -0x30(%rdx)
	vmovdqu	%xmm6, -0x20(%rdx)
	vmovdqu	%xmm7, -0x10(%rdx)
	ret

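/* 32 to 63 bytes: two 16-byte chunks from each end.  */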
	.p2align 4
L(less_64bytes):
	cmp	$32, %dl
	jb	L(less_32bytes)
	vmovdqu	(%rsi), %xmm0
	vmovdqu	0x10(%rsi), %xmm1
	vmovdqu	-0x20(%rsi, %rdx), %xmm6
	vmovdqu	-0x10(%rsi, %rdx), %xmm7
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm1, 0x10(%rdi)
	vmovdqu	%xmm6, -0x20(%rdi, %rdx)
	vmovdqu	%xmm7, -0x10(%rdi, %rdx)
	ret

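/* 16 to 31 bytes: one 16-byte chunk from each end.  */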
	.p2align 4
L(less_32bytes):
	vmovdqu	(%rsi), %xmm0
	vmovdqu	-0x10(%rsi, %rdx), %xmm7
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm7, -0x10(%rdi, %rdx)
	ret

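/* Less than 16 bytes: two overlapping integer moves of 8, 4 or 2 bytes
   picked by the length; a single byte or an empty copy falls out at the
   bottom.  */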
	.p2align 4
L(less_16bytes):
	cmp	$8, %dl
	jb	L(less_8bytes)
	movq	-0x08(%rsi, %rdx), %rcx
	movq	(%rsi), %rsi
	movq	%rsi, (%rdi)
	movq	%rcx, -0x08(%rdi, %rdx)
	ret

	.p2align 4
L(less_8bytes):
	cmp	$4, %dl
	jb	L(less_4bytes)
	mov	-0x04(%rsi, %rdx), %ecx
	mov	(%rsi), %esi
	mov	%esi, (%rdi)
	mov	%ecx, -0x04(%rdi, %rdx)
	ret

L(less_4bytes):
	cmp	$1, %dl
	jbe	L(less_2bytes)
	mov	-0x02(%rsi, %rdx), %cx
	mov	(%rsi), %si
	mov	%si, (%rdi)
	mov	%cx, -0x02(%rdi, %rdx)
	ret

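/* The flags from the cmp $1, %dl above are still live here: below
   means a zero-length copy, equal means exactly one byte.  */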
L(less_2bytes):
	jb	L(less_0bytes)
	mov	(%rsi), %cl
	mov	%cl, (%rdi)
L(less_0bytes):
	ret

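/* At least 256 bytes.  For memmove, switch to the backward copier when
   the destination overlaps the end of the source.  Otherwise pick a
   strategy by size: an aligned 128-byte store loop up to 2K, then
   rep movsb or non-temporal stores beyond that.  */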
	.p2align 4
L(256bytesormore):
#ifdef USE_AS_MEMMOVE
	mov	%rdi, %rcx
	sub	%rsi, %rcx
	cmp	%rdx, %rcx
	jc	L(copy_backward)
#endif
	cmp	$2048, %rdx
	jae	L(gobble_data_movsb)
	mov	%rax, %r8
	lea	(%rsi, %rdx), %rcx
	mov	%rdi, %r10
	vmovdqu	-0x80(%rcx), %xmm5
	vmovdqu	-0x70(%rcx), %xmm6
	mov	$0x80, %rax
	and	$-32, %rdi
	add	$32, %rdi
	vmovdqu	-0x60(%rcx), %xmm7
	vmovdqu	-0x50(%rcx), %xmm8
	mov	%rdi, %r11
	sub	%r10, %r11
	vmovdqu	-0x40(%rcx), %xmm9
	vmovdqu	-0x30(%rcx), %xmm10
	sub	%r11, %rdx
	vmovdqu	-0x20(%rcx), %xmm11
	vmovdqu	-0x10(%rcx), %xmm12
	vmovdqu	(%rsi), %ymm4
	add	%r11, %rsi
	sub	%eax, %edx
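/* 256 to 2047 bytes: the first 32 bytes (%ymm4) and the last 128 bytes
   (%xmm5-%xmm12) were saved above and the destination rounded up to a
   32-byte boundary; %r8 parks the return value while %rax holds the
   0x80 stride.  The loop moves 128 bytes per iteration with unaligned
   loads and aligned stores.  */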
L(gobble_128_loop):
	vmovdqu	(%rsi), %ymm0
	vmovdqu	0x20(%rsi), %ymm1
	vmovdqu	0x40(%rsi), %ymm2
	vmovdqu	0x60(%rsi), %ymm3
	add	%rax, %rsi
	vmovdqa	%ymm0, (%rdi)
	vmovdqa	%ymm1, 0x20(%rdi)
	vmovdqa	%ymm2, 0x40(%rdi)
	vmovdqa	%ymm3, 0x60(%rdi)
	add	%rax, %rdi
	sub	%eax, %edx
	jae	L(gobble_128_loop)
	add	%eax, %edx
	add	%rdi, %rdx
	vmovdqu	%ymm4, (%r10)
	vzeroupper
	vmovdqu	%xmm5, -0x80(%rdx)
	vmovdqu	%xmm6, -0x70(%rdx)
	vmovdqu	%xmm7, -0x60(%rdx)
	vmovdqu	%xmm8, -0x50(%rdx)
	vmovdqu	%xmm9, -0x40(%rdx)
	vmovdqu	%xmm10, -0x30(%rdx)
	vmovdqu	%xmm11, -0x20(%rdx)
	vmovdqu	%xmm12, -0x10(%rdx)
	mov	%r8, %rax
	ret

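/* 2K and up: lengths below four times the shared cache size are copied
   with rep movsb, larger ones with the non-temporal loop below.  */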
	.p2align 4
L(gobble_data_movsb):
#ifdef SHARED_CACHE_SIZE_HALF
	mov	$SHARED_CACHE_SIZE_HALF, %rcx
#else
	mov	__x86_shared_cache_size_half(%rip), %rcx
#endif
	shl	$3, %rcx
	cmp	%rcx, %rdx
	jae	L(gobble_big_data_fwd)
	mov	%rdx, %rcx
	rep	movsb
	ret

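/* Large forward copy that bypasses the cache: save the 32-byte head and
   128-byte tail as in the 2K path, align the destination to 32 bytes,
   then stream 128 bytes per iteration with prefetchnta hints and
   non-temporal stores, fenced before the ordinary trailing stores.  */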
	.p2align 4
L(gobble_big_data_fwd):
	lea	(%rsi, %rdx), %rcx
	vmovdqu	(%rsi), %ymm4
	vmovdqu	-0x80(%rsi, %rdx), %xmm5
	vmovdqu	-0x70(%rcx), %xmm6
	vmovdqu	-0x60(%rcx), %xmm7
	vmovdqu	-0x50(%rcx), %xmm8
	vmovdqu	-0x40(%rcx), %xmm9
	vmovdqu	-0x30(%rcx), %xmm10
	vmovdqu	-0x20(%rcx), %xmm11
	vmovdqu	-0x10(%rcx), %xmm12
	mov	%rdi, %r8
	and	$-32, %rdi
	add	$32, %rdi
	mov	%rdi, %r10
	sub	%r8, %r10
	sub	%r10, %rdx
	add	%r10, %rsi
	lea	(%rdi, %rdx), %rcx
	add	$-0x80, %rdx
L(gobble_mem_fwd_loop):
	prefetchnta 0x1c0(%rsi)
	prefetchnta 0x280(%rsi)
	vmovdqu	(%rsi), %ymm0
	vmovdqu	0x20(%rsi), %ymm1
	vmovdqu	0x40(%rsi), %ymm2
	vmovdqu	0x60(%rsi), %ymm3
	sub	$-0x80, %rsi
	vmovntdq %ymm0, (%rdi)
	vmovntdq %ymm1, 0x20(%rdi)
	vmovntdq %ymm2, 0x40(%rdi)
	vmovntdq %ymm3, 0x60(%rdi)
	sub	$-0x80, %rdi
	add	$-0x80, %rdx
	jb	L(gobble_mem_fwd_loop)
	sfence
	vmovdqu	%ymm4, (%r8)
	vzeroupper
	vmovdqu	%xmm5, -0x80(%rcx)
	vmovdqu	%xmm6, -0x70(%rcx)
	vmovdqu	%xmm7, -0x60(%rcx)
	vmovdqu	%xmm8, -0x50(%rcx)
	vmovdqu	%xmm9, -0x40(%rcx)
	vmovdqu	%xmm10, -0x30(%rcx)
	vmovdqu	%xmm11, -0x20(%rcx)
	vmovdqu	%xmm12, -0x10(%rcx)
	ret

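/* memmove only: the regions overlap with the destination above the
   source, so copy from the end toward the start.  The first 128 bytes
   of the source and its last 32 bytes are saved up front, the end of
   the destination is aligned down to 32 bytes, and the bulk moves in
   128-byte steps with aligned stores, or with non-temporal stores once
   the length exceeds four times the shared cache size.  */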
#ifdef USE_AS_MEMMOVE
	.p2align 4
L(copy_backward):
#ifdef SHARED_CACHE_SIZE_HALF
	mov	$SHARED_CACHE_SIZE_HALF, %rcx
#else
	mov	__x86_shared_cache_size_half(%rip), %rcx
#endif
	shl	$3, %rcx
	vmovdqu	(%rsi), %xmm5
	vmovdqu	0x10(%rsi), %xmm6
	add	%rdx, %rdi
	vmovdqu	0x20(%rsi), %xmm7
	vmovdqu	0x30(%rsi), %xmm8
	lea	-0x20(%rdi), %r10
	mov	%rdi, %r11
	vmovdqu	0x40(%rsi), %xmm9
	vmovdqu	0x50(%rsi), %xmm10
	and	$0x1f, %r11
	vmovdqu	0x60(%rsi), %xmm11
	vmovdqu	0x70(%rsi), %xmm12
	xor	%r11, %rdi
	add	%rdx, %rsi
	vmovdqu	-0x20(%rsi), %ymm4
	sub	%r11, %rsi
	sub	%r11, %rdx
	cmp	%rcx, %rdx
	ja	L(gobble_big_data_bwd)
	add	$-0x80, %rdx
L(gobble_mem_bwd_llc):
	vmovdqu	-0x20(%rsi), %ymm0
	vmovdqu	-0x40(%rsi), %ymm1
	vmovdqu	-0x60(%rsi), %ymm2
	vmovdqu	-0x80(%rsi), %ymm3
	lea	-0x80(%rsi), %rsi
	vmovdqa	%ymm0, -0x20(%rdi)
	vmovdqa	%ymm1, -0x40(%rdi)
	vmovdqa	%ymm2, -0x60(%rdi)
	vmovdqa	%ymm3, -0x80(%rdi)
	lea	-0x80(%rdi), %rdi
	add	$-0x80, %rdx
	jb	L(gobble_mem_bwd_llc)
	vmovdqu	%ymm4, (%r10)
	vzeroupper
	vmovdqu	%xmm5, (%rax)
	vmovdqu	%xmm6, 0x10(%rax)
	vmovdqu	%xmm7, 0x20(%rax)
	vmovdqu	%xmm8, 0x30(%rax)
	vmovdqu	%xmm9, 0x40(%rax)
	vmovdqu	%xmm10, 0x50(%rax)
	vmovdqu	%xmm11, 0x60(%rax)
	vmovdqu	%xmm12, 0x70(%rax)
	ret

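/* The same backward walk for copies past the cache bound: prefetchnta
   ahead of the loads, non-temporal stores, and an sfence before the
   saved head and tail are written back.  */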
	.p2align 4
L(gobble_big_data_bwd):
	add	$-0x80, %rdx
L(gobble_mem_bwd_loop):
	prefetchnta -0x1c0(%rsi)
	prefetchnta -0x280(%rsi)
	vmovdqu	-0x20(%rsi), %ymm0
	vmovdqu	-0x40(%rsi), %ymm1
	vmovdqu	-0x60(%rsi), %ymm2
	vmovdqu	-0x80(%rsi), %ymm3
	lea	-0x80(%rsi), %rsi
	vmovntdq %ymm0, -0x20(%rdi)
	vmovntdq %ymm1, -0x40(%rdi)
	vmovntdq %ymm2, -0x60(%rdi)
	vmovntdq %ymm3, -0x80(%rdi)
	lea	-0x80(%rdi), %rdi
	add	$-0x80, %rdx
	jb	L(gobble_mem_bwd_loop)
	sfence
	vmovdqu	%ymm4, (%r10)
	vzeroupper
	vmovdqu	%xmm5, (%rax)
	vmovdqu	%xmm6, 0x10(%rax)
	vmovdqu	%xmm7, 0x20(%rax)
	vmovdqu	%xmm8, 0x30(%rax)
	vmovdqu	%xmm9, 0x40(%rax)
	vmovdqu	%xmm10, 0x50(%rax)
	vmovdqu	%xmm11, 0x60(%rax)
	vmovdqu	%xmm12, 0x70(%rax)
	ret
#endif
END (MEMCPY)
#endif