Prefer https to http for gnu.org and fsf.org URLs

[thirdparty/glibc.git] / sysdeps / x86_64 / mul_1.S
diff --git a/sysdeps/x86_64/mul_1.S b/sysdeps/x86_64/mul_1.S

index 978916b72c5d57cbff5bf97d03475259f6d029b1..117c77cca0b4d1427558cea565f8d0ae20a546eb 100644 (file)
--- a/sysdeps/x86_64/mul_1.S
+++ b/sysdeps/x86_64/mul_1.S
@@ -1,6 +1,6 @@
  /* AMD64 __mpn_mul_1 -- Multiply a limb vector with a limb and store
     the result in a second limb vector.
-   Copyright (C) 2004 Free Software Foundation, Inc.
+   Copyright (C) 2003-2019 Free Software Foundation, Inc.
     This file is part of the GNU MP Library.
  
     The GNU MP Library is free software; you can redistribute it and/or modify
@@ -14,29 +14,115 @@
     License for more details.
  
     You should have received a copy of the GNU Lesser General Public License
-   along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
-   the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
-   MA 02111-1307, USA. */
+   along with the GNU MP Library; see the file COPYING.LIB.  If not,
+   see <https://www.gnu.org/licenses/>.  */
  
  #include <sysdep.h>
  #include "asm-syntax.h"
  
+#define rp     %rdi            /* result pointer: product limbs are stored through it */
+#define up     %rsi            /* source pointer: u limbs are loaded through it */
+#define n_param        %rdx    /* limb count as passed in; copied away because mul writes rdx */
+#define vl     %rcx            /* the single multiplier limb, operand of every mul */
+
+#define n      %r11            /* working copy of the count; negated and used as the loop index */
+
         .text
  ENTRY (__mpn_mul_1)
-       movq    %rdx, %r11
-       leaq    (%rsi,%rdx,8), %rsi
-       leaq    (%rdi,%rdx,8), %rdi
-       negq    %r11
-       xorl    %r8d, %r8d
-L(loop):
-       movq    (%rsi,%r11,8), %rax
-       mulq    %rcx
-       addq    %r8, %rax
-       movl    $0, %r8d
-       adcq    %rdx, %r8
-       movq    %rax, (%rdi,%r11,8)
-       incq    %r11
-       jne     L(loop)
-       movq    %r8, %rax
+       push    %rbx                    /* save rbx: used below as scratch and as an accumulator */
+       cfi_adjust_cfa_offset (8)       /* unwind annotation for the 8-byte push (macro defined outside this file) */
+       cfi_rel_offset (%rbx, 0)
+       xor     %r10, %r10              /* r10 = 0 */
+       mov     (up), %rax              /* read first u limb early */
+       mov     n_param, %rbx           /* move away n from rdx, mul uses it */
+       mul     vl                      /* rdx:rax = first u limb * vl */
+       mov     %rbx, %r11              /* n = limb count */
+
+       add     %r10, %rax              /* r10 is still zero, so this add/adc pair leaves rdx:rax unchanged */
+       adc     $0, %rdx
+
+       and     $3, %ebx                /* ebx = n mod 4, selects the entry point into the unrolled loop */
+       jz      L(b0)                   /* n mod 4 == 0 */
+       cmp     $2, %ebx
+       jz      L(b2)                   /* n mod 4 == 2 */
+       jg      L(b3)                   /* n mod 4 == 3 */
+
+L(b1): dec     n                       /* n mod 4 == 1 (fall-through case) */
+       jne     L(gt1)                  /* more than one limb: go set up the loop */
+       mov     %rax, (rp)              /* n was 1: store the low product limb ... */
+       jmp     L(ret)                  /* ... and return the high limb held in rdx */
+L(gt1):        lea     8(up,n,8), up   /* rebase up and rp so that the negated n indexes them */
+       lea     -8(rp,n,8), rp
+       neg     n                       /* n is negative from here and counts up toward zero */
+       xor     %r10, %r10
+       xor     %ebx, %ebx
+       mov     %rax, %r9               /* r9 = low limb of the first product (stored at L(L1)) */
+       mov     (up,n,8), %rax          /* preload the next u limb for the mul at L(L1) */
+       mov     %rdx, %r8               /* r8 = high limb of the first product */
+       jmp     L(L1)
+
+L(b0): lea     (up,n,8), up            /* n mod 4 == 0: rebase pointers for entry at L(L0) */
+       lea     -16(rp,n,8), rp
+       neg     n
+       xor     %r10, %r10
+       mov     %rax, %r8               /* r8 = low limb of the first product (stored at L(L0)) */
+       mov     %rdx, %rbx              /* rbx = high limb of the first product */
+       jmp     L(L0)
+
+L(b3): lea     -8(up,n,8), up          /* n mod 4 == 3: rebase pointers for entry at L(L3) */
+       lea     -24(rp,n,8), rp
+       neg     n
+       mov     %rax, %rbx              /* rbx = low limb of the first product (stored at L(L3)) */
+       mov     %rdx, %r10              /* r10 = high limb of the first product */
+       jmp     L(L3)
+
+L(b2): lea     -16(up,n,8), up         /* n mod 4 == 2: rebase pointers for entry at L(L2) */
+       lea     -32(rp,n,8), rp
+       neg     n
+       xor     %r8, %r8
+       xor     %ebx, %ebx
+       mov     %rax, %r10              /* r10 = low limb of the first product (stored at L(top) or after the loop) */
+       mov     24(up,n,8), %rax        /* preload the next u limb for the mul at L(L2) */
+       mov     %rdx, %r9               /* r9 = high limb of the first product */
+       jmp     L(L2)
+       /* Main loop: four limbs per pass; r10, r9, r8 and rbx take turns holding the limb to store and the carry into the next one.  */
+       .p2align 4                      /* align the loop head to 16 bytes */
+L(top): mov    %r10, (rp,n,8)          /* store the finished limb */
+       add     %rax, %r9               /* add low half of the product from L(L2) */
+       mov     (up,n,8), %rax          /* mov does not touch flags: carry stays live for the adc */
+       adc     %rdx, %r8               /* r8 = high half + carry */
+       mov     $0, %r10d               /* clear r10 for reuse as an accumulator */
+L(L1): mul     vl
+       mov     %r9, 8(rp,n,8)
+       add     %rax, %r8
+       adc     %rdx, %rbx
+L(L0): mov     8(up,n,8), %rax
+       mul     vl
+       mov     %r8, 16(rp,n,8)
+       add     %rax, %rbx
+       adc     %rdx, %r10
+L(L3): mov     16(up,n,8), %rax
+       mul     vl
+       mov     %rbx, 24(rp,n,8)
+       mov     $0, %r8d                # zero
+       mov     %r8, %rbx               # zero
+       add     %rax, %r10
+       mov     24(up,n,8), %rax
+       mov     %r8, %r9                # zero
+       adc     %rdx, %r9
+L(L2): mul     vl
+       add     $4, n                   /* advance the index by the four limbs of this pass */
+       js      L(top)                  /* keep looping while n is still negative */
+
+       mov     %r10, (rp,n,8)          /* loop finished: store the pending limb */
+       add     %rax, %r9               /* add low half of the last product */
+       adc     %r8, %rdx               /* r8 is zero on every path to here, so only the carry is added */
+       mov     %r9, 8(rp,n,8)          /* store the last result limb */
+       add     %r8, %rdx               /* r8 is zero: rdx is unchanged */
+L(ret):        mov     %rdx, %rax      /* return value = the remaining high limb */
+
+       pop     %rbx                    /* restore the caller's rbx */
+       cfi_adjust_cfa_offset (-8)
+       cfi_restore (%rbx)
         ret
  END (__mpn_mul_1)