Improve 64bit memcpy performance for Haswell CPU with AVX instruction
author     Ling Ma <ling.ml@alibaba-inc.com>
           Mon, 14 Jul 2014 04:02:52 +0000 (00:02 -0400)
committer  H.J. Lu <hjl.tools@gmail.com>
           Wed, 30 Jul 2014 15:02:35 +0000 (08:02 -0700)
In this patch we take advantage of Haswell's memory bandwidth, reduce branch
mispredictions by avoiding branch instructions, and force the destination to
be aligned so that aligned AVX stores can be used.

The CPU2006 403.gcc benchmark indicates that this patch improves performance
by 2% to 10%.
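
A rough C sketch of the branch-avoidance idea (illustrative only, not code
from this patch; the helper name is made up; build with -mavx):

    #include <immintrin.h>
    #include <stddef.h>

    /* Hypothetical helper: copy n bytes when 32 <= n <= 64.  One unaligned
       32-byte move from each end covers the whole range; the two stores may
       overlap in the middle, which is harmless for memcpy semantics, so no
       further branching on the exact length is needed.  */
    static void
    copy_32_to_64 (void *dst, const void *src, size_t n)
    {
      __m256i head = _mm256_loadu_si256 ((const __m256i *) src);
      __m256i tail = _mm256_loadu_si256
        ((const __m256i *) ((const char *) src + n - 32));
      _mm256_storeu_si256 ((__m256i *) dst, head);
      _mm256_storeu_si256 ((__m256i *) ((char *) dst + n - 32), tail);
    }

The new assembly applies the same pattern with XMM/YMM registers for several
size bands, and for copies of 256 bytes or more it additionally rounds the
destination up to a 32-byte boundary so the main loop can use aligned vmovdqa
or non-temporal vmovntdq stores.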

12 files changed:
ChangeLog
sysdeps/x86_64/multiarch/Makefile
sysdeps/x86_64/multiarch/ifunc-impl-list.c
sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S [new file with mode: 0644]
sysdeps/x86_64/multiarch/memcpy.S
sysdeps/x86_64/multiarch/memcpy_chk.S
sysdeps/x86_64/multiarch/memmove-avx-unaligned.S [new file with mode: 0644]
sysdeps/x86_64/multiarch/memmove.c
sysdeps/x86_64/multiarch/memmove_chk.c
sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S [new file with mode: 0644]
sysdeps/x86_64/multiarch/mempcpy.S
sysdeps/x86_64/multiarch/mempcpy_chk.S

index 7cb9a066b0b778139a54e2af9d377f0030e4c192..77bf70da8eff8b652e8d9194b30e2486112fc556 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,23 @@
+2014-07-30  Ling Ma  <ling.ml@alibaba-inc.com>
+
+       * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
+       memmove-avx-unaligned, memcpy-avx-unaligned and
+       mempcpy-avx-unaligned.
+       * sysdeps/x86_64/multiarch/ifunc-impl-list.c (__libc_ifunc_impl_list):
+       Add tests for AVX memcpy functions.
+       * sysdeps/x86_64/multiarch/memcpy.S: Add support for AVX memcpy.
+       * sysdeps/x86_64/multiarch/memcpy_chk.S: Add support for AVX
+       memcpy_chk.
+       * sysdeps/x86_64/multiarch/memmove.c: Add support for AVX memmove.
+       * sysdeps/x86_64/multiarch/memmove_chk.c: Add support for AVX
+       memmove_chk.
+       * sysdeps/x86_64/multiarch/mempcpy.S: Add support for AVX mempcpy.
+       * sysdeps/x86_64/multiarch/mempcpy_chk.S: Add support for AVX
+       mempcpy_chk.
+       * sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S: New file.
+       * sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S: New file.
+       * sysdeps/x86_64/multiarch/memmove-avx-unaligned.S: New file.
+
 2014-07-29  Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
 
        [BZ #17213]
index 3bb9702b959ba99bdf40bc2d16fc0245a9bcf3e0..d7002a9df3c7884ec4d00991939f2f3a596c6be9 100644 (file)
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -11,6 +11,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
                   memcmp-sse4 memcpy-ssse3 \
                   memcpy-sse2-unaligned mempcpy-ssse3 \
                   memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
+                  memmove-avx-unaligned memcpy-avx-unaligned mempcpy-avx-unaligned \
                   memmove-ssse3-back strcasecmp_l-ssse3 \
                   strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
                   strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
index 7e93e598db2e9f25b17cd738f94d3d2291214b5f..78e9b20079931221e5b5d53589b9c0e841eaf46c 100644 (file)
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -46,6 +46,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/memmove_chk.S.  */
   IFUNC_IMPL (i, name, __memmove_chk,
+             IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_AVX,
+                             __memmove_chk_avx_unaligned)
              IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_SSSE3,
                              __memmove_chk_ssse3_back)
              IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_SSSE3,
@@ -55,6 +57,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/memmove.S.  */
   IFUNC_IMPL (i, name, memmove,
+             IFUNC_IMPL_ADD (array, i, memmove, HAS_AVX,
+                             __memmove_avx_unaligned)
              IFUNC_IMPL_ADD (array, i, memmove, HAS_SSSE3,
                              __memmove_ssse3_back)
              IFUNC_IMPL_ADD (array, i, memmove, HAS_SSSE3,
@@ -214,6 +218,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 #ifdef SHARED
   /* Support sysdeps/x86_64/multiarch/memcpy_chk.S.  */
   IFUNC_IMPL (i, name, __memcpy_chk,
+             IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_AVX,
+                             __memcpy_chk_avx_unaligned)
              IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_SSSE3,
                              __memcpy_chk_ssse3_back)
              IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_SSSE3,
@@ -223,6 +229,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/memcpy.S.  */
   IFUNC_IMPL (i, name, memcpy,
+             IFUNC_IMPL_ADD (array, i, memcpy, HAS_AVX,
+                             __memcpy_avx_unaligned)
              IFUNC_IMPL_ADD (array, i, memcpy, HAS_SSSE3,
                              __memcpy_ssse3_back)
              IFUNC_IMPL_ADD (array, i, memcpy, HAS_SSSE3, __memcpy_ssse3)
@@ -231,6 +239,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/mempcpy_chk.S.  */
   IFUNC_IMPL (i, name, __mempcpy_chk,
+             IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_AVX,
+                             __mempcpy_chk_avx_unaligned)
              IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_SSSE3,
                              __mempcpy_chk_ssse3_back)
              IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_SSSE3,
@@ -240,6 +250,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/mempcpy.S.  */
   IFUNC_IMPL (i, name, mempcpy,
+             IFUNC_IMPL_ADD (array, i, mempcpy, HAS_AVX,
+                             __mempcpy_avx_unaligned)
              IFUNC_IMPL_ADD (array, i, mempcpy, HAS_SSSE3,
                              __mempcpy_ssse3_back)
              IFUNC_IMPL_ADD (array, i, mempcpy, HAS_SSSE3,
diff --git a/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
new file mode 100644 (file)
index 0000000..3cac1e3
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
@@ -0,0 +1,376 @@
+/* memcpy with AVX
+   Copyright (C) 2014 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#if !defined NOT_IN_libc \
+    && (defined SHARED \
+        || defined USE_AS_MEMMOVE \
+       || !defined USE_MULTIARCH)
+
+#include "asm-syntax.h"
+#ifndef MEMCPY
+# define MEMCPY        __memcpy_avx_unaligned
+# define MEMCPY_CHK    __memcpy_chk_avx_unaligned
+#endif
+
+       .section .text.avx,"ax",@progbits
+#if !defined USE_AS_BCOPY
+ENTRY (MEMCPY_CHK)
+       cmpq    %rdx, %rcx
+       jb      HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMCPY_CHK)
+#endif
+
+ENTRY (MEMCPY)
+       mov     %rdi, %rax
+#ifdef USE_AS_MEMPCPY
+       add     %rdx, %rax
+#endif
+       cmp     $256, %rdx
+       jae     L(256bytesormore)
+       cmp     $16, %dl
+       jb      L(less_16bytes)
+       cmp     $128, %dl
+       jb      L(less_128bytes)
+       vmovdqu (%rsi), %xmm0
+       lea     (%rsi, %rdx), %rcx
+       vmovdqu 0x10(%rsi), %xmm1
+       vmovdqu 0x20(%rsi), %xmm2
+       vmovdqu 0x30(%rsi), %xmm3
+       vmovdqu 0x40(%rsi), %xmm4
+       vmovdqu 0x50(%rsi), %xmm5
+       vmovdqu 0x60(%rsi), %xmm6
+       vmovdqu 0x70(%rsi), %xmm7
+       vmovdqu -0x80(%rcx), %xmm8
+       vmovdqu -0x70(%rcx), %xmm9
+       vmovdqu -0x60(%rcx), %xmm10
+       vmovdqu -0x50(%rcx), %xmm11
+       vmovdqu -0x40(%rcx), %xmm12
+       vmovdqu -0x30(%rcx), %xmm13
+       vmovdqu -0x20(%rcx), %xmm14
+       vmovdqu -0x10(%rcx), %xmm15
+       lea     (%rdi, %rdx), %rdx
+       vmovdqu %xmm0, (%rdi)
+       vmovdqu %xmm1, 0x10(%rdi)
+       vmovdqu %xmm2, 0x20(%rdi)
+       vmovdqu %xmm3, 0x30(%rdi)
+       vmovdqu %xmm4, 0x40(%rdi)
+       vmovdqu %xmm5, 0x50(%rdi)
+       vmovdqu %xmm6, 0x60(%rdi)
+       vmovdqu %xmm7, 0x70(%rdi)
+       vmovdqu %xmm8, -0x80(%rdx)
+       vmovdqu %xmm9, -0x70(%rdx)
+       vmovdqu %xmm10, -0x60(%rdx)
+       vmovdqu %xmm11, -0x50(%rdx)
+       vmovdqu %xmm12, -0x40(%rdx)
+       vmovdqu %xmm13, -0x30(%rdx)
+       vmovdqu %xmm14, -0x20(%rdx)
+       vmovdqu %xmm15, -0x10(%rdx)
+       ret
+       .p2align 4
+L(less_128bytes):
+       cmp     $64, %dl
+       jb      L(less_64bytes)
+       vmovdqu (%rsi), %xmm0
+       lea     (%rsi, %rdx), %rcx
+       vmovdqu 0x10(%rsi), %xmm1
+       vmovdqu 0x20(%rsi), %xmm2
+       lea     (%rdi, %rdx), %rdx
+       vmovdqu 0x30(%rsi), %xmm3
+       vmovdqu -0x40(%rcx), %xmm4
+       vmovdqu -0x30(%rcx), %xmm5
+       vmovdqu -0x20(%rcx), %xmm6
+       vmovdqu -0x10(%rcx), %xmm7
+       vmovdqu %xmm0, (%rdi)
+       vmovdqu %xmm1, 0x10(%rdi)
+       vmovdqu %xmm2, 0x20(%rdi)
+       vmovdqu %xmm3, 0x30(%rdi)
+       vmovdqu %xmm4, -0x40(%rdx)
+       vmovdqu %xmm5, -0x30(%rdx)
+       vmovdqu %xmm6, -0x20(%rdx)
+       vmovdqu %xmm7, -0x10(%rdx)
+       ret
+
+       .p2align 4
+L(less_64bytes):
+       cmp     $32, %dl
+       jb      L(less_32bytes)
+       vmovdqu (%rsi), %xmm0
+       vmovdqu 0x10(%rsi), %xmm1
+       vmovdqu -0x20(%rsi, %rdx), %xmm6
+       vmovdqu -0x10(%rsi, %rdx), %xmm7
+       vmovdqu %xmm0, (%rdi)
+       vmovdqu %xmm1, 0x10(%rdi)
+       vmovdqu %xmm6, -0x20(%rdi, %rdx)
+       vmovdqu %xmm7, -0x10(%rdi, %rdx)
+       ret
+
+       .p2align 4
+L(less_32bytes):
+       vmovdqu (%rsi), %xmm0
+       vmovdqu -0x10(%rsi, %rdx), %xmm7
+       vmovdqu %xmm0, (%rdi)
+       vmovdqu %xmm7, -0x10(%rdi, %rdx)
+       ret
+
+       .p2align 4
+L(less_16bytes):
+       cmp     $8, %dl
+       jb      L(less_8bytes)
+       movq -0x08(%rsi, %rdx), %rcx
+       movq (%rsi),    %rsi
+       movq %rsi, (%rdi)
+       movq %rcx, -0x08(%rdi, %rdx)
+       ret
+
+       .p2align 4
+L(less_8bytes):
+       cmp     $4, %dl
+       jb      L(less_4bytes)
+       mov -0x04(%rsi, %rdx), %ecx
+       mov (%rsi),     %esi
+       mov %esi, (%rdi)
+       mov %ecx, -0x04(%rdi, %rdx)
+       ret
+
+L(less_4bytes):
+       cmp     $1, %dl
+       jbe     L(less_2bytes)
+       mov -0x02(%rsi, %rdx),  %cx
+       mov (%rsi),     %si
+       mov %si, (%rdi)
+       mov %cx, -0x02(%rdi, %rdx)
+       ret
+
+L(less_2bytes):
+       jb      L(less_0bytes)
+       mov     (%rsi), %cl
+       mov     %cl,    (%rdi)
+L(less_0bytes):
+       ret
+
+       .p2align 4
+L(256bytesormore):
+#ifdef USE_AS_MEMMOVE
+       mov     %rdi, %rcx
+       sub     %rsi, %rcx
+       cmp     %rdx, %rcx
+       jc      L(copy_backward)
+#endif
+       cmp     $2048, %rdx
+       jae     L(gobble_data_movsb)
+       mov     %rax, %r8
+       lea     (%rsi, %rdx), %rcx
+       mov     %rdi, %r10
+       vmovdqu -0x80(%rcx), %xmm5
+       vmovdqu -0x70(%rcx), %xmm6
+       mov     $0x80, %rax
+       and     $-32, %rdi
+       add     $32, %rdi
+       vmovdqu -0x60(%rcx), %xmm7
+       vmovdqu -0x50(%rcx), %xmm8
+       mov     %rdi, %r11
+       sub     %r10, %r11
+       vmovdqu -0x40(%rcx), %xmm9
+       vmovdqu -0x30(%rcx), %xmm10
+       sub     %r11, %rdx
+       vmovdqu -0x20(%rcx), %xmm11
+       vmovdqu -0x10(%rcx), %xmm12
+       vmovdqu (%rsi), %ymm4
+       add     %r11, %rsi
+       sub     %eax, %edx
+L(goble_128_loop):
+       vmovdqu (%rsi), %ymm0
+       vmovdqu 0x20(%rsi), %ymm1
+       vmovdqu 0x40(%rsi), %ymm2
+       vmovdqu 0x60(%rsi), %ymm3
+       add     %rax, %rsi
+       vmovdqa %ymm0, (%rdi)
+       vmovdqa %ymm1, 0x20(%rdi)
+       vmovdqa %ymm2, 0x40(%rdi)
+       vmovdqa %ymm3, 0x60(%rdi)
+       add     %rax, %rdi
+       sub     %eax, %edx
+       jae     L(goble_128_loop)
+       add     %eax, %edx
+       add     %rdi, %rdx
+       vmovdqu %ymm4, (%r10)
+       vzeroupper
+       vmovdqu %xmm5, -0x80(%rdx)
+       vmovdqu %xmm6, -0x70(%rdx)
+       vmovdqu %xmm7, -0x60(%rdx)
+       vmovdqu %xmm8, -0x50(%rdx)
+       vmovdqu %xmm9, -0x40(%rdx)
+       vmovdqu %xmm10, -0x30(%rdx)
+       vmovdqu %xmm11, -0x20(%rdx)
+       vmovdqu %xmm12, -0x10(%rdx)
+       mov     %r8, %rax
+       ret
+
+       .p2align 4
+L(gobble_data_movsb):
+#ifdef SHARED_CACHE_SIZE_HALF
+       mov     $SHARED_CACHE_SIZE_HALF, %rcx
+#else
+       mov     __x86_shared_cache_size_half(%rip), %rcx
+#endif
+       shl     $3, %rcx
+       cmp     %rcx, %rdx
+       jae     L(gobble_big_data_fwd)
+       mov     %rdx, %rcx
+       mov     %rdx, %rcx
+       rep     movsb
+       ret
+
+       .p2align 4
+L(gobble_big_data_fwd):
+       lea     (%rsi, %rdx), %rcx
+       vmovdqu (%rsi), %ymm4
+       vmovdqu -0x80(%rsi,%rdx), %xmm5
+       vmovdqu -0x70(%rcx), %xmm6
+       vmovdqu -0x60(%rcx), %xmm7
+       vmovdqu -0x50(%rcx), %xmm8
+       vmovdqu -0x40(%rcx), %xmm9
+       vmovdqu -0x30(%rcx), %xmm10
+       vmovdqu -0x20(%rcx), %xmm11
+       vmovdqu -0x10(%rcx), %xmm12
+       mov     %rdi, %r8
+       and     $-32, %rdi
+       add     $32, %rdi
+       mov     %rdi, %r10
+       sub     %r8, %r10
+       sub     %r10, %rdx
+       add     %r10, %rsi
+       lea     (%rdi, %rdx), %rcx
+       add     $-0x80, %rdx
+L(gobble_mem_fwd_loop):
+       prefetchnta 0x1c0(%rsi)
+       prefetchnta 0x280(%rsi)
+       vmovdqu (%rsi), %ymm0
+       vmovdqu 0x20(%rsi), %ymm1
+       vmovdqu 0x40(%rsi), %ymm2
+       vmovdqu 0x60(%rsi), %ymm3
+       sub     $-0x80, %rsi
+       vmovntdq        %ymm0, (%rdi)
+       vmovntdq        %ymm1, 0x20(%rdi)
+       vmovntdq        %ymm2, 0x40(%rdi)
+       vmovntdq        %ymm3, 0x60(%rdi)
+       sub     $-0x80, %rdi
+       add     $-0x80, %rdx
+       jb      L(gobble_mem_fwd_loop)
+       sfence
+       vmovdqu %ymm4, (%r8)
+       vzeroupper
+       vmovdqu %xmm5, -0x80(%rcx)
+       vmovdqu %xmm6, -0x70(%rcx)
+       vmovdqu %xmm7, -0x60(%rcx)
+       vmovdqu %xmm8, -0x50(%rcx)
+       vmovdqu %xmm9, -0x40(%rcx)
+       vmovdqu %xmm10, -0x30(%rcx)
+       vmovdqu %xmm11, -0x20(%rcx)
+       vmovdqu %xmm12, -0x10(%rcx)
+       ret
+
+#ifdef USE_AS_MEMMOVE
+       .p2align 4
+L(copy_backward):
+#ifdef SHARED_CACHE_SIZE_HALF
+       mov     $SHARED_CACHE_SIZE_HALF, %rcx
+#else
+       mov     __x86_shared_cache_size_half(%rip), %rcx
+#endif
+       shl     $3, %rcx
+       vmovdqu (%rsi), %xmm5
+       vmovdqu 0x10(%rsi), %xmm6
+       add     %rdx, %rdi
+       vmovdqu 0x20(%rsi), %xmm7
+       vmovdqu 0x30(%rsi), %xmm8
+       lea     -0x20(%rdi), %r10
+       mov %rdi, %r11
+       vmovdqu 0x40(%rsi), %xmm9
+       vmovdqu 0x50(%rsi), %xmm10
+       and     $0x1f, %r11
+       vmovdqu 0x60(%rsi), %xmm11
+       vmovdqu 0x70(%rsi), %xmm12
+       xor     %r11, %rdi
+       add     %rdx, %rsi
+       vmovdqu -0x20(%rsi), %ymm4
+       sub     %r11, %rsi
+       sub     %r11, %rdx
+       cmp     %rcx, %rdx
+       ja      L(gobble_big_data_bwd)
+       add     $-0x80, %rdx
+L(gobble_mem_bwd_llc):
+       vmovdqu -0x20(%rsi), %ymm0
+       vmovdqu -0x40(%rsi), %ymm1
+       vmovdqu -0x60(%rsi), %ymm2
+       vmovdqu -0x80(%rsi), %ymm3
+       lea     -0x80(%rsi), %rsi
+       vmovdqa %ymm0, -0x20(%rdi)
+       vmovdqa %ymm1, -0x40(%rdi)
+       vmovdqa %ymm2, -0x60(%rdi)
+       vmovdqa %ymm3, -0x80(%rdi)
+       lea     -0x80(%rdi), %rdi
+       add     $-0x80, %rdx
+       jb      L(gobble_mem_bwd_llc)
+       vmovdqu %ymm4, (%r10)
+       vzeroupper
+       vmovdqu %xmm5, (%rax)
+       vmovdqu %xmm6, 0x10(%rax)
+       vmovdqu %xmm7, 0x20(%rax)
+       vmovdqu %xmm8, 0x30(%rax)
+       vmovdqu %xmm9, 0x40(%rax)
+       vmovdqu %xmm10, 0x50(%rax)
+       vmovdqu %xmm11, 0x60(%rax)
+       vmovdqu %xmm12, 0x70(%rax)
+       ret
+
+       .p2align 4
+L(gobble_big_data_bwd):
+       add     $-0x80, %rdx
+L(gobble_mem_bwd_loop):
+       prefetchnta -0x1c0(%rsi)
+       prefetchnta -0x280(%rsi)
+       vmovdqu -0x20(%rsi), %ymm0
+       vmovdqu -0x40(%rsi), %ymm1
+       vmovdqu -0x60(%rsi), %ymm2
+       vmovdqu -0x80(%rsi), %ymm3
+       lea     -0x80(%rsi), %rsi
+       vmovntdq        %ymm0, -0x20(%rdi)
+       vmovntdq        %ymm1, -0x40(%rdi)
+       vmovntdq        %ymm2, -0x60(%rdi)
+       vmovntdq        %ymm3, -0x80(%rdi)
+       lea     -0x80(%rdi), %rdi
+       add     $-0x80, %rdx
+       jb      L(gobble_mem_bwd_loop)
+       sfence
+       vmovdqu %ymm4, (%r10)
+       vzeroupper
+       vmovdqu %xmm5, (%rax)
+       vmovdqu %xmm6, 0x10(%rax)
+       vmovdqu %xmm7, 0x20(%rax)
+       vmovdqu %xmm8, 0x30(%rax)
+       vmovdqu %xmm9, 0x40(%rax)
+       vmovdqu %xmm10, 0x50(%rax)
+       vmovdqu %xmm11, 0x60(%rax)
+       vmovdqu %xmm12, 0x70(%rax)
+       ret
+#endif
+END (MEMCPY)
+#endif
index 40ae92638642a9d52a58e157fc394bb53994d332..e6666954075f924f9f3a19e199e02b1310eb416b 100644 (file)
--- a/sysdeps/x86_64/multiarch/memcpy.S
+++ b/sysdeps/x86_64/multiarch/memcpy.S
@@ -32,6 +32,10 @@ ENTRY(__new_memcpy)
        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
        jne     1f
        call    __init_cpu_features
+1:     leaq    __memcpy_avx_unaligned(%rip), %rax
+       testl   $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
+       jz 1f
+       ret
 1:     leaq    __memcpy_sse2(%rip), %rax
        testl   $bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip)
        jnz     2f
index 3c0270fd2358661fd0b1131f7e4740832fe9fabe..076b19a9eac5b343d7be1a0e19e0e15ff6c48b7a 100644 (file)
--- a/sysdeps/x86_64/multiarch/memcpy_chk.S
+++ b/sysdeps/x86_64/multiarch/memcpy_chk.S
@@ -39,6 +39,9 @@ ENTRY(__memcpy_chk)
        testl   $bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
        jz      2f
        leaq    __memcpy_chk_ssse3_back(%rip), %rax
+       testl   $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
+       jz  2f
+       leaq    __memcpy_chk_avx_unaligned(%rip), %rax
 2:     ret
 END(__memcpy_chk)
 # else
diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned.S
new file mode 100644 (file)
index 0000000..faed9fb
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned.S
@@ -0,0 +1,22 @@
+/* memmove with AVX
+   Copyright (C) 2014 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define USE_AS_MEMMOVE
+#define MEMCPY         __memmove_avx_unaligned
+#define MEMCPY_CHK     __memmove_chk_avx_unaligned
+#include "memcpy-avx-unaligned.S"
index ba86e7bbb1cf50c85938f2cac5e4d0d17536e848..0c9af7e4dfeec1973f212a67ac7593d484677f26 100644 (file)
--- a/sysdeps/x86_64/multiarch/memmove.c
+++ b/sysdeps/x86_64/multiarch/memmove.c
@@ -35,6 +35,8 @@
 extern __typeof (__redirect_memmove) __memmove_sse2 attribute_hidden;
 extern __typeof (__redirect_memmove) __memmove_ssse3 attribute_hidden;
 extern __typeof (__redirect_memmove) __memmove_ssse3_back attribute_hidden;
+extern __typeof (__redirect_memmove) __memmove_avx_unaligned attribute_hidden;
+
 #endif
 
 #include "string/memmove.c"
@@ -47,10 +49,12 @@ extern __typeof (__redirect_memmove) __memmove_ssse3_back attribute_hidden;
    ifunc symbol properly.  */
 extern __typeof (__redirect_memmove) __libc_memmove;
 libc_ifunc (__libc_memmove,
-           HAS_SSSE3
-           ? (HAS_FAST_COPY_BACKWARD
-              ? __memmove_ssse3_back : __memmove_ssse3)
-           : __memmove_sse2)
+           HAS_AVX
+           ? __memmove_avx_unaligned
+           : (HAS_SSSE3
+              ? (HAS_FAST_COPY_BACKWARD
+                 ? __memmove_ssse3_back : __memmove_ssse3)
+              : __memmove_sse2));
 
 strong_alias (__libc_memmove, memmove)
 
index cb1acb65982399cd046a6dfefcf471844ec298f9..44344f2820c52c234672975326c3e659e719ec62 100644 (file)
--- a/sysdeps/x86_64/multiarch/memmove_chk.c
+++ b/sysdeps/x86_64/multiarch/memmove_chk.c
 extern __typeof (__memmove_chk) __memmove_chk_sse2 attribute_hidden;
 extern __typeof (__memmove_chk) __memmove_chk_ssse3 attribute_hidden;
 extern __typeof (__memmove_chk) __memmove_chk_ssse3_back attribute_hidden;
+extern __typeof (__memmove_chk) __memmove_chk_avx_unaligned attribute_hidden;
 
 #include "debug/memmove_chk.c"
 
 libc_ifunc (__memmove_chk,
-           HAS_SSSE3
+           HAS_AVX ? __memmove_chk_avx_unaligned :
+           (HAS_SSSE3
            ? (HAS_FAST_COPY_BACKWARD
               ? __memmove_chk_ssse3_back : __memmove_chk_ssse3)
-           : __memmove_chk_sse2);
+           : __memmove_chk_sse2));
diff --git a/sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S b/sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S
new file mode 100644 (file)
index 0000000..438bda3
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S
@@ -0,0 +1,22 @@
+/* mempcpy with AVX
+   Copyright (C) 2014 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define USE_AS_MEMPCPY
+#define MEMCPY         __mempcpy_avx_unaligned
+#define MEMCPY_CHK     __mempcpy_chk_avx_unaligned
+#include "memcpy-avx-unaligned.S"
index b9f04c2ec4d73fa6aa5f399ba5dd2c6f08a3fe1c..7589d8c1ec6b92d8d1388a00a745e59c3dfd55c4 100644 (file)
--- a/sysdeps/x86_64/multiarch/mempcpy.S
+++ b/sysdeps/x86_64/multiarch/mempcpy.S
@@ -37,6 +37,9 @@ ENTRY(__mempcpy)
        testl   $bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
        jz      2f
        leaq    __mempcpy_ssse3_back(%rip), %rax
+       testl   $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
+       jz      2f
+       leaq    __mempcpy_avx_unaligned(%rip), %rax
 2:     ret
 END(__mempcpy)
 
index c28473a6694cdf9b811b06a9da90f8d08111155f..88e0b74e83738d89b1508bd299d9926fe3c35a89 100644 (file)
--- a/sysdeps/x86_64/multiarch/mempcpy_chk.S
+++ b/sysdeps/x86_64/multiarch/mempcpy_chk.S
@@ -39,6 +39,9 @@ ENTRY(__mempcpy_chk)
        testl   $bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
        jz      2f
        leaq    __mempcpy_chk_ssse3_back(%rip), %rax
+       testl   $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
+       jz      2f
+       leaq    __mempcpy_chk_avx_unaligned(%rip), %rax
 2:     ret
 END(__mempcpy_chk)
 # else