/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>
#include <asm/errno.h>
#include <asm/cpufeatures.h>
#include <asm/alternative-asm.h>

/*
 * We build a jump to memcpy_orig by default, which gets NOPped out on
 * the majority of x86 CPUs, which set REP_GOOD. In addition, on CPUs
 * which have the enhanced REP MOVSB/STOSB feature (ERMS), those NOPs
 * are changed to a jmp to memcpy_erms, which does the REP MOVSB copy.
 */

.weak memcpy

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 *  rax original destination
 */
ENTRY(__memcpy)
ENTRY(memcpy)
        ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
                      "jmp memcpy_erms", X86_FEATURE_ERMS

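        /*
         * Default (REP_GOOD) body: copy count/8 qwords with REP MOVSQ,
         * then the remaining count%8 bytes with REP MOVSB; %rax keeps
         * the original destination as the return value.
         */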
        movq %rdi, %rax
        movq %rdx, %rcx
        shrq $3, %rcx
        andl $7, %edx
        rep movsq
        movl %edx, %ecx
        rep movsb
        ret
ENDPROC(memcpy)
ENDPROC(__memcpy)

/*
 * memcpy_erms() - enhanced fast string memcpy. This is faster and
 * simpler than memcpy. Use memcpy_erms when possible.
 */
ENTRY(memcpy_erms)
        movq %rdi, %rax
        movq %rdx, %rcx
        rep movsb
        ret
ENDPROC(memcpy_erms)

ENTRY(memcpy_orig)
        movq %rdi, %rax

        cmpq $0x20, %rdx
        jb .Lhandle_tail

        /*
         * We check whether a memory false dependence could occur,
         * then jump to the corresponding copy mode.
         */
        cmp %dil, %sil
        jl .Lcopy_backward
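        /*
         * Note: only the low bytes of the two pointers are compared
         * above; when the source's low byte is below the destination's,
         * the backward loop is taken so that the loads do not pick up a
         * false dependence on stores from earlier iterations.
         */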
        subq $0x20, %rdx
.Lcopy_forward_loop:
        subq $0x20, %rdx

        /*
         * Move in blocks of 4x8 bytes:
         */
        movq 0*8(%rsi), %r8
        movq 1*8(%rsi), %r9
        movq 2*8(%rsi), %r10
        movq 3*8(%rsi), %r11
        leaq 4*8(%rsi), %rsi

        movq %r8, 0*8(%rdi)
        movq %r9, 1*8(%rdi)
        movq %r10, 2*8(%rdi)
        movq %r11, 3*8(%rdi)
        leaq 4*8(%rdi), %rdi
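        /*
         * The jae below consumes the carry flag set by the subq at the
         * top of the loop; the movq and leaq instructions in between do
         * not modify the flags. %rdx was biased down by 0x20 before the
         * loop, and the addl below undoes that bias so %edx again holds
         * the tail byte count (0..31).
         */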
        jae .Lcopy_forward_loop
        addl $0x20, %edx
        jmp .Lhandle_tail

.Lcopy_backward:
        /*
         * Calculate copy position to tail.
         */
        addq %rdx, %rsi
        addq %rdx, %rdi
        subq $0x20, %rdx
        /*
         * At most 3 ALU operations can execute in one cycle, so append
         * NOPs to stay within the same 16-byte chunk.
         */
        .p2align 4
.Lcopy_backward_loop:
        subq $0x20, %rdx
        movq -1*8(%rsi), %r8
        movq -2*8(%rsi), %r9
        movq -3*8(%rsi), %r10
        movq -4*8(%rsi), %r11
        leaq -4*8(%rsi), %rsi
        movq %r8, -1*8(%rdi)
        movq %r9, -2*8(%rdi)
        movq %r10, -3*8(%rdi)
        movq %r11, -4*8(%rdi)
        leaq -4*8(%rdi), %rdi
        jae .Lcopy_backward_loop

        /*
         * Calculate copy position to head.
         */
        addl $0x20, %edx
        subq %rdx, %rsi
        subq %rdx, %rdi
.Lhandle_tail:
        cmpl $16, %edx
        jb .Lless_16bytes

        /*
         * Copy 16 to 31 bytes.
         */
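        /*
         * Two qwords are read from the start of the block and two from
         * its end before any store is issued; for lengths below 32 the
         * two ranges overlap in the middle, so those bytes are simply
         * written twice with the same data. The 8..15 and 4..7 byte
         * cases below use the same trick.
         */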
        movq 0*8(%rsi), %r8
        movq 1*8(%rsi), %r9
        movq -2*8(%rsi, %rdx), %r10
        movq -1*8(%rsi, %rdx), %r11
        movq %r8, 0*8(%rdi)
        movq %r9, 1*8(%rdi)
        movq %r10, -2*8(%rdi, %rdx)
        movq %r11, -1*8(%rdi, %rdx)
        retq
        .p2align 4
.Lless_16bytes:
        cmpl $8, %edx
        jb .Lless_8bytes
        /*
         * Copy 8 to 15 bytes.
         */
        movq 0*8(%rsi), %r8
        movq -1*8(%rsi, %rdx), %r9
        movq %r8, 0*8(%rdi)
        movq %r9, -1*8(%rdi, %rdx)
        retq
        .p2align 4
.Lless_8bytes:
        cmpl $4, %edx
        jb .Lless_3bytes

        /*
         * Copy 4 to 7 bytes.
         */
        movl (%rsi), %ecx
        movl -4(%rsi, %rdx), %r8d
        movl %ecx, (%rdi)
        movl %r8d, -4(%rdi, %rdx)
        retq
        .p2align 4
.Lless_3bytes:
        subl $1, %edx
        jb .Lend
        /*
         * Copy 1 to 3 bytes.
         */
        movzbl (%rsi), %ecx
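        /*
         * The flags here are still those of the subl $1, %edx above
         * (movzbl does not change them): zero means exactly one byte
         * is left to store.
         */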
        jz .Lstore_1byte
        movzbq 1(%rsi), %r8
        movzbq (%rsi, %rdx), %r9
        movb %r8b, 1(%rdi)
        movb %r9b, (%rdi, %rdx)
.Lstore_1byte:
        movb %cl, (%rdi)

.Lend:
        retq
ENDPROC(memcpy_orig)

#ifndef CONFIG_UML
/*
 * memcpy_mcsafe_unrolled - memory copy with machine check exception handling
 * Note that we only catch machine checks when reading the source addresses.
 * Writes to target are posted and don't generate machine checks.
 */
ENTRY(memcpy_mcsafe_unrolled)
        cmpl $8, %edx
        /* Less than 8 bytes? Go to byte copy loop */
        jb .L_no_whole_words

        /* Check for bad alignment of source */
        testl $7, %esi
        /* Already aligned */
        jz .L_8byte_aligned

        /* Copy one byte at a time until source is 8-byte aligned */
        movl %esi, %ecx
        andl $7, %ecx
        subl $8, %ecx
        negl %ecx
        subl %ecx, %edx
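        /*
         * %ecx = 8 - (source & 7): the 1..7 leading bytes needed to
         * reach 8-byte source alignment (the already-aligned case was
         * branched over above); they are subtracted from the count.
         */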
.L_copy_leading_bytes:
        movb (%rsi), %al
        movb %al, (%rdi)
        incq %rsi
        incq %rdi
        decl %ecx
        jnz .L_copy_leading_bytes

.L_8byte_aligned:
        /* Figure out how many whole cache lines (64-bytes) to copy */
        movl %edx, %ecx
        andl $63, %edx
        shrl $6, %ecx
        jz .L_no_whole_cache_lines

        /* Loop copying whole cache lines */
.L_cache_w0: movq (%rsi), %r8
.L_cache_w1: movq 1*8(%rsi), %r9
.L_cache_w2: movq 2*8(%rsi), %r10
.L_cache_w3: movq 3*8(%rsi), %r11
        movq %r8, (%rdi)
        movq %r9, 1*8(%rdi)
        movq %r10, 2*8(%rdi)
        movq %r11, 3*8(%rdi)
.L_cache_w4: movq 4*8(%rsi), %r8
.L_cache_w5: movq 5*8(%rsi), %r9
.L_cache_w6: movq 6*8(%rsi), %r10
.L_cache_w7: movq 7*8(%rsi), %r11
        movq %r8, 4*8(%rdi)
        movq %r9, 5*8(%rdi)
        movq %r10, 6*8(%rdi)
        movq %r11, 7*8(%rdi)
        leaq 64(%rsi), %rsi
        leaq 64(%rdi), %rdi
        decl %ecx
        jnz .L_cache_w0
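        /*
         * Each load above carries a .L_cache_w* label so that a machine
         * check on any of them is resolved via the exception table
         * entries at the end of this file and lands in the fixup code.
         */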

        /* Are there any trailing 8-byte words? */
.L_no_whole_cache_lines:
        movl %edx, %ecx
        andl $7, %edx
        shrl $3, %ecx
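        /* %ecx = whole trailing 8-byte words, %edx = final byte count (0..7) */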
        jz .L_no_whole_words

        /* Copy trailing words */
.L_copy_trailing_words:
        movq (%rsi), %r8
        movq %r8, (%rdi)
        leaq 8(%rsi), %rsi
        leaq 8(%rdi), %rdi
        decl %ecx
        jnz .L_copy_trailing_words

        /* Any trailing bytes? */
.L_no_whole_words:
        andl %edx, %edx
        jz .L_done_memcpy_trap

        /* Copy trailing bytes */
        movl %edx, %ecx
.L_copy_trailing_bytes:
        movb (%rsi), %al
        movb %al, (%rdi)
        incq %rsi
        incq %rdi
        decl %ecx
        jnz .L_copy_trailing_bytes

        /* Copy successful. Return zero */
.L_done_memcpy_trap:
        xorq %rax, %rax
        ret
ENDPROC(memcpy_mcsafe_unrolled)

        .section .fixup, "ax"
        /* Return -EFAULT for any failure */
.L_memcpy_mcsafe_fail:
        mov $-EFAULT, %rax
        ret

        .previous

        _ASM_EXTABLE_FAULT(.L_copy_leading_bytes, .L_memcpy_mcsafe_fail)
        _ASM_EXTABLE_FAULT(.L_cache_w0, .L_memcpy_mcsafe_fail)
        _ASM_EXTABLE_FAULT(.L_cache_w1, .L_memcpy_mcsafe_fail)
        _ASM_EXTABLE_FAULT(.L_cache_w2, .L_memcpy_mcsafe_fail)
        _ASM_EXTABLE_FAULT(.L_cache_w3, .L_memcpy_mcsafe_fail)
        _ASM_EXTABLE_FAULT(.L_cache_w4, .L_memcpy_mcsafe_fail)
        _ASM_EXTABLE_FAULT(.L_cache_w5, .L_memcpy_mcsafe_fail)
        _ASM_EXTABLE_FAULT(.L_cache_w6, .L_memcpy_mcsafe_fail)
        _ASM_EXTABLE_FAULT(.L_cache_w7, .L_memcpy_mcsafe_fail)
        _ASM_EXTABLE_FAULT(.L_copy_trailing_words, .L_memcpy_mcsafe_fail)
        _ASM_EXTABLE_FAULT(.L_copy_trailing_bytes, .L_memcpy_mcsafe_fail)
#endif