/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>
#include <asm/errno.h>
#include <asm/cpufeatures.h>
#include <asm/alternative-asm.h>

/*
 * We build a jump to memcpy_orig by default, which gets NOPped out on
 * the majority of x86 CPUs, which set REP_GOOD. In addition, on CPUs
 * which have the enhanced REP MOVSB/STOSB feature (ERMS), those NOPs
 * are changed to a jmp to memcpy_erms, which does the REP MOVSB copy.
 */

.weak memcpy

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 *  rax original destination
 */
ENTRY(__memcpy)
ENTRY(memcpy)
        ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
                      "jmp memcpy_erms", X86_FEATURE_ERMS

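        /*
         * Default (REP_GOOD) body: copy count/8 qwords with REP MOVSQ,
         * then the remaining count%8 bytes with REP MOVSB; %rax keeps
         * the original destination as the return value.
         */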
        movq %rdi, %rax
        movq %rdx, %rcx
        shrq $3, %rcx
        andl $7, %edx
        rep movsq
        movl %edx, %ecx
        rep movsb
        ret
ENDPROC(memcpy)
ENDPROC(__memcpy)

/*
 * memcpy_erms() - enhanced fast string memcpy. This is faster and
 * simpler than memcpy. Use memcpy_erms when possible.
 */
ENTRY(memcpy_erms)
        movq %rdi, %rax
        movq %rdx, %rcx
        rep movsb
        ret
ENDPROC(memcpy_erms)

ENTRY(memcpy_orig)
        movq %rdi, %rax

        cmpq $0x20, %rdx
        jb .Lhandle_tail

        /*
         * We check whether a memory false dependence could occur,
         * then jump to the corresponding copy mode.
         */
        cmp %dil, %sil
        jl .Lcopy_backward
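        /*
         * Note: only the low bytes of the two pointers are compared
         * above; when the source's low byte is below the destination's,
         * the backward loop is taken so that the loads do not pick up a
         * false dependence on stores from earlier iterations.
         */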
        subq $0x20, %rdx
.Lcopy_forward_loop:
        subq $0x20, %rdx

        /*
         * Move in blocks of 4x8 bytes:
         */
        movq 0*8(%rsi), %r8
        movq 1*8(%rsi), %r9
        movq 2*8(%rsi), %r10
        movq 3*8(%rsi), %r11
        leaq 4*8(%rsi), %rsi

        movq %r8, 0*8(%rdi)
        movq %r9, 1*8(%rdi)
        movq %r10, 2*8(%rdi)
        movq %r11, 3*8(%rdi)
        leaq 4*8(%rdi), %rdi
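        /*
         * The jae below consumes the carry flag set by the subq at the
         * top of the loop; the movq and leaq instructions in between do
         * not modify the flags. %rdx was biased down by 0x20 before the
         * loop, and the addl below undoes that bias so %edx again holds
         * the tail byte count (0..31).
         */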
        jae .Lcopy_forward_loop
        addl $0x20, %edx
        jmp .Lhandle_tail

.Lcopy_backward:
        /*
         * Calculate copy position to tail.
         */
        addq %rdx, %rsi
        addq %rdx, %rdi
        subq $0x20, %rdx
        /*
         * At most 3 ALU operations can execute in one cycle, so append
         * NOPs to stay within the same 16-byte chunk.
         */
        .p2align 4
.Lcopy_backward_loop:
        subq $0x20, %rdx
        movq -1*8(%rsi), %r8
        movq -2*8(%rsi), %r9
        movq -3*8(%rsi), %r10
        movq -4*8(%rsi), %r11
        leaq -4*8(%rsi), %rsi
        movq %r8, -1*8(%rdi)
        movq %r9, -2*8(%rdi)
        movq %r10, -3*8(%rdi)
        movq %r11, -4*8(%rdi)
        leaq -4*8(%rdi), %rdi
        jae .Lcopy_backward_loop

        /*
         * Calculate copy position to head.
         */
        addl $0x20, %edx
        subq %rdx, %rsi
        subq %rdx, %rdi
.Lhandle_tail:
        cmpl $16, %edx
        jb .Lless_16bytes

        /*
         * Copy 16 to 31 bytes.
         */
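        /*
         * Two qwords are read from the start of the block and two from
         * its end before any store is issued; for lengths below 32 the
         * two ranges overlap in the middle, so those bytes are simply
         * written twice with the same data. The 8..15 and 4..7 byte
         * cases below use the same trick.
         */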
        movq 0*8(%rsi), %r8
        movq 1*8(%rsi), %r9
        movq -2*8(%rsi, %rdx), %r10
        movq -1*8(%rsi, %rdx), %r11
        movq %r8, 0*8(%rdi)
        movq %r9, 1*8(%rdi)
        movq %r10, -2*8(%rdi, %rdx)
        movq %r11, -1*8(%rdi, %rdx)
        retq
        .p2align 4
.Lless_16bytes:
        cmpl $8, %edx
        jb .Lless_8bytes
        /*
         * Copy 8 to 15 bytes.
         */
        movq 0*8(%rsi), %r8
        movq -1*8(%rsi, %rdx), %r9
        movq %r8, 0*8(%rdi)
        movq %r9, -1*8(%rdi, %rdx)
        retq
        .p2align 4
.Lless_8bytes:
        cmpl $4, %edx
        jb .Lless_3bytes

        /*
         * Copy 4 to 7 bytes.
         */
        movl (%rsi), %ecx
        movl -4(%rsi, %rdx), %r8d
        movl %ecx, (%rdi)
        movl %r8d, -4(%rdi, %rdx)
        retq
        .p2align 4
.Lless_3bytes:
        subl $1, %edx
        jb .Lend
        /*
         * Copy 1 to 3 bytes.
         */
        movzbl (%rsi), %ecx
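        /*
         * The flags here are still those of the subl $1, %edx above
         * (movzbl does not change them): zero means exactly one byte
         * is left to store.
         */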
        jz .Lstore_1byte
        movzbq 1(%rsi), %r8
        movzbq (%rsi, %rdx), %r9
        movb %r8b, 1(%rdi)
        movb %r9b, (%rdi, %rdx)
.Lstore_1byte:
        movb %cl, (%rdi)

.Lend:
        retq
ENDPROC(memcpy_orig)

#ifndef CONFIG_UML
/*
 * memcpy_mcsafe_unrolled - memory copy with machine check exception handling
 * Note that we only catch machine checks when reading the source addresses.
 * Writes to target are posted and don't generate machine checks.
 */
ENTRY(memcpy_mcsafe_unrolled)
        cmpl $8, %edx
        /* Less than 8 bytes? Go to byte copy loop */
        jb .L_no_whole_words

        /* Check for bad alignment of source */
        testl $7, %esi
        /* Already aligned */
        jz .L_8byte_aligned

        /* Copy one byte at a time until source is 8-byte aligned */
        movl %esi, %ecx
        andl $7, %ecx
        subl $8, %ecx
        negl %ecx
        subl %ecx, %edx
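        /*
         * %ecx = 8 - (source & 7): the 1..7 leading bytes needed to
         * reach 8-byte source alignment (the already-aligned case was
         * branched over above); they are subtracted from the count.
         */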
.L_copy_leading_bytes:
        movb (%rsi), %al
        movb %al, (%rdi)
        incq %rsi
        incq %rdi
        decl %ecx
        jnz .L_copy_leading_bytes

.L_8byte_aligned:
        /* Figure out how many whole cache lines (64-bytes) to copy */
        movl %edx, %ecx
        andl $63, %edx
        shrl $6, %ecx
        jz .L_no_whole_cache_lines

        /* Loop copying whole cache lines */
.L_cache_w0: movq (%rsi), %r8
.L_cache_w1: movq 1*8(%rsi), %r9
.L_cache_w2: movq 2*8(%rsi), %r10
.L_cache_w3: movq 3*8(%rsi), %r11
        movq %r8, (%rdi)
        movq %r9, 1*8(%rdi)
        movq %r10, 2*8(%rdi)
        movq %r11, 3*8(%rdi)
.L_cache_w4: movq 4*8(%rsi), %r8
.L_cache_w5: movq 5*8(%rsi), %r9
.L_cache_w6: movq 6*8(%rsi), %r10
.L_cache_w7: movq 7*8(%rsi), %r11
        movq %r8, 4*8(%rdi)
        movq %r9, 5*8(%rdi)
        movq %r10, 6*8(%rdi)
        movq %r11, 7*8(%rdi)
        leaq 64(%rsi), %rsi
        leaq 64(%rdi), %rdi
        decl %ecx
        jnz .L_cache_w0
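        /*
         * Each load above carries a .L_cache_w* label so that a machine
         * check on any of them is resolved via the exception table
         * entries at the end of this file and lands in the fixup code.
         */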

        /* Are there any trailing 8-byte words? */
.L_no_whole_cache_lines:
        movl %edx, %ecx
        andl $7, %edx
        shrl $3, %ecx
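        /* %ecx = whole trailing 8-byte words, %edx = final byte count (0..7) */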
        jz .L_no_whole_words

        /* Copy trailing words */
.L_copy_trailing_words:
        movq (%rsi), %r8
        movq %r8, (%rdi)
        leaq 8(%rsi), %rsi
        leaq 8(%rdi), %rdi
        decl %ecx
        jnz .L_copy_trailing_words

        /* Any trailing bytes? */
.L_no_whole_words:
        andl %edx, %edx
        jz .L_done_memcpy_trap

        /* Copy trailing bytes */
        movl %edx, %ecx
.L_copy_trailing_bytes:
        movb (%rsi), %al
        movb %al, (%rdi)
        incq %rsi
        incq %rdi
        decl %ecx
        jnz .L_copy_trailing_bytes

        /* Copy successful. Return zero */
.L_done_memcpy_trap:
        xorq %rax, %rax
        ret
ENDPROC(memcpy_mcsafe_unrolled)

        .section .fixup, "ax"
        /* Return -EFAULT for any failure */
.L_memcpy_mcsafe_fail:
        mov $-EFAULT, %rax
        ret

        .previous

        _ASM_EXTABLE_FAULT(.L_copy_leading_bytes, .L_memcpy_mcsafe_fail)
        _ASM_EXTABLE_FAULT(.L_cache_w0, .L_memcpy_mcsafe_fail)
        _ASM_EXTABLE_FAULT(.L_cache_w1, .L_memcpy_mcsafe_fail)
        _ASM_EXTABLE_FAULT(.L_cache_w2, .L_memcpy_mcsafe_fail)
        _ASM_EXTABLE_FAULT(.L_cache_w3, .L_memcpy_mcsafe_fail)
        _ASM_EXTABLE_FAULT(.L_cache_w4, .L_memcpy_mcsafe_fail)
        _ASM_EXTABLE_FAULT(.L_cache_w5, .L_memcpy_mcsafe_fail)
        _ASM_EXTABLE_FAULT(.L_cache_w6, .L_memcpy_mcsafe_fail)
        _ASM_EXTABLE_FAULT(.L_cache_w7, .L_memcpy_mcsafe_fail)
        _ASM_EXTABLE_FAULT(.L_copy_trailing_words, .L_memcpy_mcsafe_fail)
        _ASM_EXTABLE_FAULT(.L_copy_trailing_bytes, .L_memcpy_mcsafe_fail)
#endif