/* memcpy with unaligned loads
   Copyright (C) 2013-2015 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>

#include "asm-syntax.h"


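/* Implements memcpy (dest = %rdi, src = %rsi, n = %rdx) with unaligned
   SSE2 loads and stores; returns dest in %rax.  */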
ENTRY(__memcpy_sse2_unaligned)
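	/* Take the defensive path when (src - dst - n) is unsigned-below
	   2 * n, i.e. when the two buffers are suspected to overlap.  */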
	movq	%rsi, %rax
	leaq	(%rdx,%rdx), %rcx
	subq	%rdi, %rax
	subq	%rdx, %rax
	cmpq	%rcx, %rax
	jb	L(overlapping)
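	/* Sizes of at most 16 bytes get a branchy scalar copy.  */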
	cmpq	$16, %rdx
	jbe	L(less_16)
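	/* Copy the first and the last 16 bytes; the two chunks may
	   overlap, and by themselves they finish every n in 17..32.  */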
	movdqu	(%rsi), %xmm8
	cmpq	$32, %rdx
	movdqu	%xmm8, (%rdi)
	movdqu	-16(%rsi,%rdx), %xmm8
	movdqu	%xmm8, -16(%rdi,%rdx)
	ja	.L31
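/* memcpy returns the destination pointer.  */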
L(return):
	movq	%rdi, %rax
	ret
	.p2align 4,,10
	.p2align 4
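/* 33..128 bytes: keep pairing 16-byte chunks taken from both ends.  */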
.L31:
	movdqu	16(%rsi), %xmm8
	cmpq	$64, %rdx
	movdqu	%xmm8, 16(%rdi)
	movdqu	-32(%rsi,%rdx), %xmm8
	movdqu	%xmm8, -32(%rdi,%rdx)
	jbe	L(return)
	movdqu	32(%rsi), %xmm8
	cmpq	$128, %rdx
	movdqu	%xmm8, 32(%rdi)
	movdqu	-48(%rsi,%rdx), %xmm8
	movdqu	%xmm8, -48(%rdi,%rdx)
	movdqu	48(%rsi), %xmm8
	movdqu	%xmm8, 48(%rdi)
	movdqu	-64(%rsi,%rdx), %xmm8
	movdqu	%xmm8, -64(%rdi,%rdx)
	jbe	L(return)
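	/* n > 128: the first and last 64 bytes are already copied above.
	   Round %rcx up to the first 64-byte boundary past dst and %rdx
	   down to the last one inside dst + n, then advance src by the
	   same amount as dst.  */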
	leaq	64(%rdi), %rcx
	addq	%rdi, %rdx
	andq	$-64, %rdx
	andq	$-64, %rcx
	movq	%rcx, %rax
	subq	%rdi, %rax
	addq	%rax, %rsi
	cmpq	%rdx, %rcx
	je	L(return)
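	/* %r10, %r9, %r8 and %rax hold the src - dst distance for the
	   four 16-byte slices of each 64-byte block.  */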
	movq	%rsi, %r10
	subq	%rcx, %r10
	leaq	16(%r10), %r9
	leaq	32(%r10), %r8
	leaq	48(%r10), %rax
	.p2align 4,,10
	.p2align 4
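/* Main loop: copy 64 bytes per iteration, unaligned loads paired with
   aligned stores.  */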
L(loop):
	movdqu	(%rcx,%r10), %xmm8
	movdqa	%xmm8, (%rcx)
	movdqu	(%rcx,%r9), %xmm8
	movdqa	%xmm8, 16(%rcx)
	movdqu	(%rcx,%r8), %xmm8
	movdqa	%xmm8, 32(%rcx)
	movdqu	(%rcx,%rax), %xmm8
	movdqa	%xmm8, 48(%rcx)
	addq	$64, %rcx
	cmpq	%rcx, %rdx
	jne	L(loop)
	jmp	L(return)
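/* Suspected overlap: copy forward in 16-byte chunks when dst is below
   src, otherwise fall back to a bytewise backward copy.  */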
L(overlapping):
	cmpq	%rsi, %rdi
	jae	.L3
	testq	%rdx, %rdx
	.p2align 4,,5
	je	L(return)
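	/* %r9 = number of full 16-byte chunks, %rax = bytes they cover.
	   Use the chunked loop only when n > 15 and the two regions are
	   at least 16 bytes apart; otherwise take the byte loop.  */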
	movq	%rdx, %r9
	leaq	16(%rsi), %rcx
	leaq	16(%rdi), %r8
	shrq	$4, %r9
	movq	%r9, %rax
	salq	$4, %rax
	cmpq	%rcx, %rdi
	setae	%cl
	cmpq	%r8, %rsi
	setae	%r8b
	orl	%r8d, %ecx
	cmpq	$15, %rdx
	seta	%r8b
	testb	%r8b, %cl
	je	.L16
	testq	%rax, %rax
	je	.L16
	xorl	%ecx, %ecx
	xorl	%r8d, %r8d
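	/* Forward copy, 16 bytes per iteration.  */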
.L7:
	movdqu	(%rsi,%rcx), %xmm8
	addq	$1, %r8
	movdqu	%xmm8, (%rdi,%rcx)
	addq	$16, %rcx
	cmpq	%r8, %r9
	ja	.L7
	cmpq	%rax, %rdx
	je	L(return)
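	/* Copy the remaining n % 16 bytes one at a time.  */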
.L21:
	movzbl	(%rsi,%rax), %ecx
	movb	%cl, (%rdi,%rax)
	addq	$1, %rax
	cmpq	%rax, %rdx
	ja	.L21
	jmp	L(return)
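/* n <= 16: dispatch on the bits of n.  Bit 3 or 4 set selects the
   8..16-byte copy, bit 2 the 4..7-byte copy; 0..3 bytes are handled
   inline below.  */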
L(less_16):
	testb	$24, %dl
	jne	L(between_9_16)
	testb	$4, %dl
	.p2align 4,,5
	jne	L(between_5_8)
	testq	%rdx, %rdx
	.p2align 4,,2
	je	L(return)
	movzbl	(%rsi), %eax
	testb	$2, %dl
	movb	%al, (%rdi)
	je	L(return)
	movzwl	-2(%rsi,%rdx), %eax
	movw	%ax, -2(%rdi,%rdx)
	jmp	L(return)
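/* dst >= src: copy backward, one byte at a time, from index n - 1
   down to 0.  */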
.L3:
	leaq	-1(%rdx), %rax
	.p2align 4,,10
	.p2align 4
.L11:
	movzbl	(%rsi,%rax), %edx
	movb	%dl, (%rdi,%rax)
	subq	$1, %rax
	jae	.L11		/* Stop on the borrow from index 0.  */
	jmp	L(return)
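/* n in 8..16: copy the first and the last 8 bytes (they may overlap).  */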
L(between_9_16):
	movq	(%rsi), %rax
	movq	%rax, (%rdi)
	movq	-8(%rsi,%rdx), %rax
	movq	%rax, -8(%rdi,%rdx)
	jmp	L(return)
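/* No usable 16-byte chunks: copy everything through the byte loop.  */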
.L16:
	xorl	%eax, %eax
	jmp	.L21
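/* n in 4..7: copy the first and the last 4 bytes (they may overlap).  */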
L(between_5_8):
	movl	(%rsi), %eax
	movl	%eax, (%rdi)
	movl	-4(%rsi,%rdx), %eax
	movl	%eax, -4(%rdi,%rdx)
	jmp	L(return)
END(__memcpy_sse2_unaligned)