/* memcpy with unaligned loads
   Copyright (C) 2013-2015 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>

#include "asm-syntax.h"

24 | ENTRY(__memcpy_sse2_unaligned) | |
25 | movq %rsi, %rax | |
26 | leaq (%rdx,%rdx), %rcx | |
27 | subq %rdi, %rax | |
28 | subq %rdx, %rax | |
29 | cmpq %rcx, %rax | |
30 | jb L(overlapping) | |
31 | cmpq $16, %rdx | |
32 | jbe L(less_16) | |
33 | movdqu (%rsi), %xmm8 | |
34 | cmpq $32, %rdx | |
35 | movdqu %xmm8, (%rdi) | |
36 | movdqu -16(%rsi,%rdx), %xmm8 | |
37 | movdqu %xmm8, -16(%rdi,%rdx) | |
38 | ja .L31 | |
39 | L(return): | |
40 | movq %rdi, %rax | |
41 | ret | |
42 | .p2align 4,,10 | |
e7044ea7 | 43 | .p2align 4 |
2d48b41c OB |
44 | .L31: |
45 | movdqu 16(%rsi), %xmm8 | |
46 | cmpq $64, %rdx | |
47 | movdqu %xmm8, 16(%rdi) | |
48 | movdqu -32(%rsi,%rdx), %xmm8 | |
49 | movdqu %xmm8, -32(%rdi,%rdx) | |
50 | jbe L(return) | |
51 | movdqu 32(%rsi), %xmm8 | |
52 | cmpq $128, %rdx | |
53 | movdqu %xmm8, 32(%rdi) | |
54 | movdqu -48(%rsi,%rdx), %xmm8 | |
55 | movdqu %xmm8, -48(%rdi,%rdx) | |
56 | movdqu 48(%rsi), %xmm8 | |
57 | movdqu %xmm8, 48(%rdi) | |
58 | movdqu -64(%rsi,%rdx), %xmm8 | |
59 | movdqu %xmm8, -64(%rdi,%rdx) | |
60 | jbe L(return) | |
61 | leaq 64(%rdi), %rcx | |
62 | addq %rdi, %rdx | |
63 | andq $-64, %rdx | |
64 | andq $-64, %rcx | |
65 | movq %rcx, %rax | |
66 | subq %rdi, %rax | |
67 | addq %rax, %rsi | |
68 | cmpq %rdx, %rcx | |
69 | je L(return) | |
70 | movq %rsi, %r10 | |
71 | subq %rcx, %r10 | |
72 | leaq 16(%r10), %r9 | |
73 | leaq 32(%r10), %r8 | |
74 | leaq 48(%r10), %rax | |
75 | .p2align 4,,10 | |
e7044ea7 | 76 | .p2align 4 |
2d48b41c OB |
77 | L(loop): |
78 | movdqu (%rcx,%r10), %xmm8 | |
79 | movdqa %xmm8, (%rcx) | |
80 | movdqu (%rcx,%r9), %xmm8 | |
81 | movdqa %xmm8, 16(%rcx) | |
82 | movdqu (%rcx,%r8), %xmm8 | |
83 | movdqa %xmm8, 32(%rcx) | |
84 | movdqu (%rcx,%rax), %xmm8 | |
85 | movdqa %xmm8, 48(%rcx) | |
86 | addq $64, %rcx | |
87 | cmpq %rcx, %rdx | |
88 | jne L(loop) | |
89 | jmp L(return) | |
90 | L(overlapping): | |
91 | cmpq %rsi, %rdi | |
92 | jae .L3 | |
93 | testq %rdx, %rdx | |
94 | .p2align 4,,5 | |
95 | je L(return) | |
96 | movq %rdx, %r9 | |
97 | leaq 16(%rsi), %rcx | |
98 | leaq 16(%rdi), %r8 | |
99 | shrq $4, %r9 | |
100 | movq %r9, %rax | |
101 | salq $4, %rax | |
102 | cmpq %rcx, %rdi | |
103 | setae %cl | |
104 | cmpq %r8, %rsi | |
105 | setae %r8b | |
106 | orl %r8d, %ecx | |
107 | cmpq $15, %rdx | |
108 | seta %r8b | |
109 | testb %r8b, %cl | |
110 | je .L16 | |
111 | testq %rax, %rax | |
112 | je .L16 | |
113 | xorl %ecx, %ecx | |
114 | xorl %r8d, %r8d | |
115 | .L7: | |
116 | movdqu (%rsi,%rcx), %xmm8 | |
117 | addq $1, %r8 | |
118 | movdqu %xmm8, (%rdi,%rcx) | |
119 | addq $16, %rcx | |
120 | cmpq %r8, %r9 | |
121 | ja .L7 | |
122 | cmpq %rax, %rdx | |
123 | je L(return) | |
124 | .L21: | |
125 | movzbl (%rsi,%rax), %ecx | |
126 | movb %cl, (%rdi,%rax) | |
127 | addq $1, %rax | |
128 | cmpq %rax, %rdx | |
129 | ja .L21 | |
130 | jmp L(return) | |
131 | L(less_16): | |
132 | testb $24, %dl | |
133 | jne L(between_9_16) | |
134 | testb $4, %dl | |
135 | .p2align 4,,5 | |
136 | jne L(between_5_8) | |
137 | testq %rdx, %rdx | |
138 | .p2align 4,,2 | |
139 | je L(return) | |
140 | movzbl (%rsi), %eax | |
141 | testb $2, %dl | |
142 | movb %al, (%rdi) | |
143 | je L(return) | |
144 | movzwl -2(%rsi,%rdx), %eax | |
145 | movw %ax, -2(%rdi,%rdx) | |
146 | jmp L(return) | |
147 | .L3: | |
148 | leaq -1(%rdx), %rax | |
149 | .p2align 4,,10 | |
e7044ea7 | 150 | .p2align 4 |
2d48b41c OB |
151 | .L11: |
152 | movzbl (%rsi,%rax), %edx | |
153 | movb %dl, (%rdi,%rax) | |
154 | subq $1, %rax | |
155 | jmp .L11 | |
156 | L(between_9_16): | |
157 | movq (%rsi), %rax | |
158 | movq %rax, (%rdi) | |
159 | movq -8(%rsi,%rdx), %rax | |
160 | movq %rax, -8(%rdi,%rdx) | |
161 | jmp L(return) | |
162 | .L16: | |
163 | xorl %eax, %eax | |
164 | jmp .L21 | |
165 | L(between_5_8): | |
166 | movl (%rsi), %eax | |
167 | movl %eax, (%rdi) | |
168 | movl -4(%rsi,%rdx), %eax | |
169 | movl %eax, -4(%rdi,%rdx) | |
170 | jmp L(return) | |
171 | END(__memcpy_sse2_unaligned) |