tools/arch/x86/lib/memset_64.S

   1 /* SPDX-License-Identifier: GPL-2.0 */
   2 /* Copyright 2002 Andi Kleen, SuSE Labs */
   3
   4 #include <linux/linkage.h>
   5 #include <asm/cpufeatures.h>
   6 #include <asm/alternative-asm.h>
   7
   8 .weak memset                    /* request weak binding for the memset symbol */
   9
  10 /*
  11  * ISO C memset - set a memory block to a byte value. This function uses fast
  12  * string to get better performance than the original function. The code is
  13  * simpler and shorter than the original function as well.
  14  *
  15  * rdi   destination
  16  * rsi   value (char)
  17  * rdx   count (bytes)
  18  *
  19  * rax   original destination
      *
      * memset and __memset are two entry labels placed on the same code.
      *
      * Besides rax, the fast string path below overwrites rcx, rdx, rsi, r9
      * and the flags, and rep stos leaves rdi advanced past the filled area.
  20  */
  21 ENTRY(memset)
  22 ENTRY(__memset)
  23         /*
  24          * Some CPUs support enhanced REP MOVSB/STOSB feature. It is recommended
  25          * to use it when possible. If not available, use fast string instructions.
  26          *
  27          * Otherwise, use original memset function.
              *
              * Reading the ALTERNATIVE_2 arguments against that description: the
              * default instruction is "jmp memset_orig"; with X86_FEATURE_REP_GOOD
              * it is replaced by nothing, so execution falls through to the
              * rep stosq / rep stosb code below; with X86_FEATURE_ERMS it becomes
              * "jmp memset_erms". (The macro itself is defined outside this file.)
  28          */
  29         ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \
  30                       "jmp memset_erms", X86_FEATURE_ERMS
  31
  32         movq %rdi,%r9           /* keep dst: rep stos advances rdi */
  33         movq %rdx,%rcx          /* rcx = full 64-bit byte count */
  34         andl $7,%edx            /* edx = count & 7, tail bytes after the qwords */
  35         shrq $3,%rcx            /* rcx = count / 8, number of qword stores */
  36         /* expand byte value  */
  37         movzbl %sil,%esi        /* rsi = fill byte, zero-extended */
  38         movabs $0x0101010101010101,%rax /* 0x01 in each of the 8 byte lanes */
  39         imulq %rsi,%rax         /* rax = fill byte replicated into all 8 bytes */
  40         rep stosq               /* store rax rcx times; forward only if DF is clear (assumed) */
  41         movl %edx,%ecx          /* rcx = remaining 0..7 bytes */
  42         rep stosb               /* store al for the tail */
  43         movq %r9,%rax           /* return the original destination */
  44         ret
  45 ENDPROC(memset)
  46 ENDPROC(__memset)
  47
  48 /*
  49  * memset_erms - ISO C memset done with one byte-granular "rep stosb".
  50  * Intended for CPUs with the enhanced REP MOVSB/STOSB feature; the
  51  * alternative at the top of memset/__memset selects "jmp memset_erms".
  52  *
  53  * In:   rdi = destination
  54  *       rsi = fill value (only the low byte, sil, is used)
  55  *       rdx = count (bytes)
  56  *
  57  * Out:  rax = original destination (rcx, r9 are scratch; rdi is advanced)
  58  */
  59 ENTRY(memset_erms)
  60         movq %rdx,%rcx          /* rcx = repeat count for rep stosb */
  61         movb %sil,%al           /* al  = byte to store */
  62         movq %rdi,%r9           /* stosb moves rdi, so remember dst */
  63         rep stosb
  64         movq %r9,%rax           /* hand back the untouched dst */
  65         ret
  66 ENDPROC(memset_erms)
  67 /*
      * memset_orig - memset using explicit store loops instead of string
      * instructions. It is the default target ("jmp memset_orig") of the
      * alternative at the top of memset/__memset.
      *
      * In:   rdi = destination
      *       rsi = fill value (only the low byte, sil, is used)
      *       rdx = count (bytes)
      *
      * Out:  rax = original destination
      *
      * Scratch: rcx, rdx, r8, r9, r10 and the flags; rdi is advanced.
      *
      * Phases: align dst to 8 bytes, 64-byte blocks, remaining qwords,
      * remaining bytes.
      */
  68 ENTRY(memset_orig)
  69         movq %rdi,%r10          /* r10 = original dst, returned in rax at .Lende */
  70
  71         /* expand byte value  */
  72         movzbl %sil,%ecx        /* rcx = fill byte, zero-extended */
  73         movabs $0x0101010101010101,%rax /* 0x01 in each of the 8 byte lanes */
  74         imulq  %rcx,%rax        /* rax = fill byte replicated into all 8 bytes */
  75
  76         /* align dst */
  77         movl  %edi,%r9d         /* 32-bit move zero-extends into r9 */
  78         andl  $7,%r9d           /* r9 = dst & 7, misalignment within a qword */
  79         jnz  .Lbad_alignment    /* nonzero: fix up first, then rejoin below */
  80 .Lafter_bad_alignment:
  81
  82         movq  %rdx,%rcx
  83         shrq  $6,%rcx           /* rcx = number of whole 64-byte blocks */
  84         jz       .Lhandle_tail  /* fewer than 64 bytes: skip the block loop */
  85
  86         .p2align 4
  87 .Lloop_64:
  88         decq  %rcx              /* sets ZF for the jnz at the loop end; mov/lea keep flags */
  89         movq  %rax,(%rdi)       /* eight qword stores = one 64-byte block */
  90         movq  %rax,8(%rdi)
  91         movq  %rax,16(%rdi)
  92         movq  %rax,24(%rdi)
  93         movq  %rax,32(%rdi)
  94         movq  %rax,40(%rdi)
  95         movq  %rax,48(%rdi)
  96         movq  %rax,56(%rdi)
  97         leaq  64(%rdi),%rdi     /* advance dst; lea so ZF from decq survives */
  98         jnz    .Lloop_64        /* loop while the block counter is not zero */
  99
 100         /* Handle tail in loops. The loops should be faster than hard
 101            to predict jump tables. */
 102         .p2align 4
 103 .Lhandle_tail:
 104         movl    %edx,%ecx
 105         andl    $63&(~7),%ecx   /* ecx = count & 0x38: bytes in whole qwords below 64 */
 106         jz              .Lhandle_7 /* no whole qword left */
 107         shrl    $3,%ecx         /* ecx = number of qwords, 1..7 */
 108         .p2align 4
 109 .Lloop_8:
 110         decl   %ecx             /* ZF consumed by the jnz below */
 111         movq  %rax,(%rdi)
 112         leaq  8(%rdi),%rdi      /* advance dst without touching flags */
 113         jnz    .Lloop_8
 114
 115 .Lhandle_7:
 116         andl    $7,%edx         /* edx = count & 7: trailing single bytes */
 117         jz      .Lende
 118         .p2align 4
 119 .Lloop_1:
 120         decl    %edx            /* ZF consumed by the jnz below */
 121         movb    %al,(%rdi)
 122         leaq    1(%rdi),%rdi    /* advance dst without touching flags */
 123         jnz     .Lloop_1
 124
 125 .Lende:
 126         movq    %r10,%rax       /* return the original destination */
 127         ret
 128
 129 .Lbad_alignment:                /* entered with r9 = dst & 7, in 1..7 */
 130         cmpq $7,%rdx
 131         jbe     .Lhandle_7      /* unsigned: count <= 7, byte loop handles it all */
 132         movq %rax,(%rdi)        /* unaligned store */
 133         movq $8,%r8
 134         subq %r9,%r8            /* r8 = 8 - (dst & 7): distance to the next 8-byte boundary */
 135         addq %r8,%rdi           /* dst now 8-byte aligned; bytes up to old dst+8 get rewritten */
 136         subq %r8,%rdx           /* count shrinks by the bytes skipped */
 137         jmp .Lafter_bad_alignment
 138 .Lfinal:                        /* not referenced anywhere in the code shown here */
 139 ENDPROC(memset_orig)