Improve 64bit memcpy performance for Haswell CPU with AVX instruction
author     Ling Ma <ling.ml@alibaba-inc.com>
           Mon, 14 Jul 2014 04:02:52 +0000 (00:02 -0400)
committer  H.J. Lu <hjl.tools@gmail.com>
           Wed, 30 Jul 2014 15:02:35 +0000 (08:02 -0700)
In this patch we take advantage of Haswell's memory bandwidth, reduce branch
mispredictions by avoiding branch instructions, and force the destination to
be aligned so that aligned AVX stores can be used.

The CPU2006 403.gcc benchmark indicates that this patch improves performance
by 2% to 10%.
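
A rough C sketch of the branch-avoidance idea (illustrative only, not code
from this patch; the helper name is made up; build with -mavx):

    #include <immintrin.h>
    #include <stddef.h>

    /* Hypothetical helper: copy n bytes when 32 <= n <= 64.  One unaligned
       32-byte move from each end covers the whole range; the two stores may
       overlap in the middle, which is harmless for memcpy semantics, so no
       further branching on the exact length is needed.  */
    static void
    copy_32_to_64 (void *dst, const void *src, size_t n)
    {
      __m256i head = _mm256_loadu_si256 ((const __m256i *) src);
      __m256i tail = _mm256_loadu_si256
        ((const __m256i *) ((const char *) src + n - 32));
      _mm256_storeu_si256 ((__m256i *) dst, head);
      _mm256_storeu_si256 ((__m256i *) ((char *) dst + n - 32), tail);
    }

The new assembly applies the same pattern with XMM/YMM registers for several
size bands, and for copies of 256 bytes or more it additionally rounds the
destination up to a 32-byte boundary so the main loop can use aligned vmovdqa
or non-temporal vmovntdq stores.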

12 files changed:
ChangeLog
sysdeps/x86_64/multiarch/Makefile
sysdeps/x86_64/multiarch/ifunc-impl-list.c
sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S [new file with mode: 0644]
sysdeps/x86_64/multiarch/memcpy.S
sysdeps/x86_64/multiarch/memcpy_chk.S
sysdeps/x86_64/multiarch/memmove-avx-unaligned.S [new file with mode: 0644]
sysdeps/x86_64/multiarch/memmove.c
sysdeps/x86_64/multiarch/memmove_chk.c
sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S [new file with mode: 0644]
sysdeps/x86_64/multiarch/mempcpy.S
sysdeps/x86_64/multiarch/mempcpy_chk.S

index 7cb9a066b0b778139a54e2af9d377f0030e4c192..77bf70da8eff8b652e8d9194b30e2486112fc556 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,23 @@
+2014-07-30  Ling Ma  <ling.ml@alibaba-inc.com>
+
+       * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
+       memmove-avx-unaligned, memcpy-avx-unaligned and
+       mempcpy-avx-unaligned.
+       * sysdeps/x86_64/multiarch/ifunc-impl-list.c (__libc_ifunc_impl_list):
+       Add tests for AVX memcpy functions.
+       * sysdeps/x86_64/multiarch/memcpy.S: Add support for AVX memcpy.
+       * sysdeps/x86_64/multiarch/memcpy_chk.S: Add support for AVX
+       memcpy_chk.
+       * sysdeps/x86_64/multiarch/memmove.c: Add support for AVX memmove.
+       * sysdeps/x86_64/multiarch/memmove_chk.c: Add support for AVX
+       memmove_chk.
+       * sysdeps/x86_64/multiarch/mempcpy.S: Add support for AVX mempcpy.
+       * sysdeps/x86_64/multiarch/mempcpy_chk.S: Add support for AVX
+       mempcpy_chk.
+       * sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S: New file.
+       * sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S: New file.
+       * sysdeps/x86_64/multiarch/memmove-avx-unaligned.S: New file.
+
 2014-07-29  Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
 
        [BZ #17213]
index 3bb9702b959ba99bdf40bc2d16fc0245a9bcf3e0..d7002a9df3c7884ec4d00991939f2f3a596c6be9 100644 (file)
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -11,6 +11,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
                   memcmp-sse4 memcpy-ssse3 \
                   memcpy-sse2-unaligned mempcpy-ssse3 \
                   memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
+                  memmove-avx-unaligned memcpy-avx-unaligned mempcpy-avx-unaligned \
                   memmove-ssse3-back strcasecmp_l-ssse3 \
                   strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
                   strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
index 7e93e598db2e9f25b17cd738f94d3d2291214b5f..78e9b20079931221e5b5d53589b9c0e841eaf46c 100644 (file)
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -46,6 +46,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/memmove_chk.S.  */
   IFUNC_IMPL (i, name, __memmove_chk,
+             IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_AVX,
+                             __memmove_chk_avx_unaligned)
              IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_SSSE3,
                              __memmove_chk_ssse3_back)
              IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_SSSE3,
@@ -55,6 +57,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/memmove.S.  */
   IFUNC_IMPL (i, name, memmove,
+             IFUNC_IMPL_ADD (array, i, memmove, HAS_AVX,
+                             __memmove_avx_unaligned)
              IFUNC_IMPL_ADD (array, i, memmove, HAS_SSSE3,
                              __memmove_ssse3_back)
              IFUNC_IMPL_ADD (array, i, memmove, HAS_SSSE3,
@@ -214,6 +218,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 #ifdef SHARED
   /* Support sysdeps/x86_64/multiarch/memcpy_chk.S.  */
   IFUNC_IMPL (i, name, __memcpy_chk,
+             IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_AVX,
+                             __memcpy_chk_avx_unaligned)
              IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_SSSE3,
                              __memcpy_chk_ssse3_back)
              IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_SSSE3,
@@ -223,6 +229,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/memcpy.S.  */
   IFUNC_IMPL (i, name, memcpy,
+             IFUNC_IMPL_ADD (array, i, memcpy, HAS_AVX,
+                             __memcpy_avx_unaligned)
              IFUNC_IMPL_ADD (array, i, memcpy, HAS_SSSE3,
                              __memcpy_ssse3_back)
              IFUNC_IMPL_ADD (array, i, memcpy, HAS_SSSE3, __memcpy_ssse3)
@@ -231,6 +239,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/mempcpy_chk.S.  */
   IFUNC_IMPL (i, name, __mempcpy_chk,
+             IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_AVX,
+                             __mempcpy_chk_avx_unaligned)
              IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_SSSE3,
                              __mempcpy_chk_ssse3_back)
              IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_SSSE3,
@@ -240,6 +250,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/mempcpy.S.  */
   IFUNC_IMPL (i, name, mempcpy,
+             IFUNC_IMPL_ADD (array, i, mempcpy, HAS_AVX,
+                             __mempcpy_avx_unaligned)
              IFUNC_IMPL_ADD (array, i, mempcpy, HAS_SSSE3,
                              __mempcpy_ssse3_back)
              IFUNC_IMPL_ADD (array, i, mempcpy, HAS_SSSE3,
diff --git a/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
new file mode 100644 (file)
index 0000000..3cac1e3
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
@@ -0,0 +1,376 @@
+/* memcpy with AVX
+   Copyright (C) 2014 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#if !defined NOT_IN_libc \
+    && (defined SHARED \
+        || defined USE_AS_MEMMOVE \
+       || !defined USE_MULTIARCH)
+
+#include "asm-syntax.h"
+#ifndef MEMCPY
+# define MEMCPY        __memcpy_avx_unaligned
+# define MEMCPY_CHK    __memcpy_chk_avx_unaligned
+#endif
+
+       .section .text.avx,"ax",@progbits
+#if !defined USE_AS_BCOPY
+ENTRY (MEMCPY_CHK)
+       cmpq    %rdx, %rcx
+       jb      HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMCPY_CHK)
+#endif
+
+ENTRY (MEMCPY)
+       mov     %rdi, %rax
+#ifdef USE_AS_MEMPCPY
+       add     %rdx, %rax
+#endif
+       cmp     $256, %rdx
+       jae     L(256bytesormore)
+       cmp     $16, %dl
+       jb      L(less_16bytes)
+       cmp     $128, %dl
+       jb      L(less_128bytes)
+       vmovdqu (%rsi), %xmm0
+       lea     (%rsi, %rdx), %rcx
+       vmovdqu 0x10(%rsi), %xmm1
+       vmovdqu 0x20(%rsi), %xmm2
+       vmovdqu 0x30(%rsi), %xmm3
+       vmovdqu 0x40(%rsi), %xmm4
+       vmovdqu 0x50(%rsi), %xmm5
+       vmovdqu 0x60(%rsi), %xmm6
+       vmovdqu 0x70(%rsi), %xmm7
+       vmovdqu -0x80(%rcx), %xmm8
+       vmovdqu -0x70(%rcx), %xmm9
+       vmovdqu -0x60(%rcx), %xmm10
+       vmovdqu -0x50(%rcx), %xmm11
+       vmovdqu -0x40(%rcx), %xmm12
+       vmovdqu -0x30(%rcx), %xmm13
+       vmovdqu -0x20(%rcx), %xmm14
+       vmovdqu -0x10(%rcx), %xmm15
+       lea     (%rdi, %rdx), %rdx
+       vmovdqu %xmm0, (%rdi)
+       vmovdqu %xmm1, 0x10(%rdi)
+       vmovdqu %xmm2, 0x20(%rdi)
+       vmovdqu %xmm3, 0x30(%rdi)
+       vmovdqu %xmm4, 0x40(%rdi)
+       vmovdqu %xmm5, 0x50(%rdi)
+       vmovdqu %xmm6, 0x60(%rdi)
+       vmovdqu %xmm7, 0x70(%rdi)
+       vmovdqu %xmm8, -0x80(%rdx)
+       vmovdqu %xmm9, -0x70(%rdx)
+       vmovdqu %xmm10, -0x60(%rdx)
+       vmovdqu %xmm11, -0x50(%rdx)
+       vmovdqu %xmm12, -0x40(%rdx)
+       vmovdqu %xmm13, -0x30(%rdx)
+       vmovdqu %xmm14, -0x20(%rdx)
+       vmovdqu %xmm15, -0x10(%rdx)
+       ret
+       .p2align 4
+L(less_128bytes):
+       cmp     $64, %dl
+       jb      L(less_64bytes)
+       vmovdqu (%rsi), %xmm0
+       lea     (%rsi, %rdx), %rcx
+       vmovdqu 0x10(%rsi), %xmm1
+       vmovdqu 0x20(%rsi), %xmm2
+       lea     (%rdi, %rdx), %rdx
+       vmovdqu 0x30(%rsi), %xmm3
+       vmovdqu -0x40(%rcx), %xmm4
+       vmovdqu -0x30(%rcx), %xmm5
+       vmovdqu -0x20(%rcx), %xmm6
+       vmovdqu -0x10(%rcx), %xmm7
+       vmovdqu %xmm0, (%rdi)
+       vmovdqu %xmm1, 0x10(%rdi)
+       vmovdqu %xmm2, 0x20(%rdi)
+       vmovdqu %xmm3, 0x30(%rdi)
+       vmovdqu %xmm4, -0x40(%rdx)
+       vmovdqu %xmm5, -0x30(%rdx)
+       vmovdqu %xmm6, -0x20(%rdx)
+       vmovdqu %xmm7, -0x10(%rdx)
+       ret
+
+       .p2align 4
+L(less_64bytes):
+       cmp     $32, %dl
+       jb      L(less_32bytes)
+       vmovdqu (%rsi), %xmm0
+       vmovdqu 0x10(%rsi), %xmm1
+       vmovdqu -0x20(%rsi, %rdx), %xmm6
+       vmovdqu -0x10(%rsi, %rdx), %xmm7
+       vmovdqu %xmm0, (%rdi)
+       vmovdqu %xmm1, 0x10(%rdi)
+       vmovdqu %xmm6, -0x20(%rdi, %rdx)
+       vmovdqu %xmm7, -0x10(%rdi, %rdx)
+       ret
+
+       .p2align 4
+L(less_32bytes):
+       vmovdqu (%rsi), %xmm0
+       vmovdqu -0x10(%rsi, %rdx), %xmm7
+       vmovdqu %xmm0, (%rdi)
+       vmovdqu %xmm7, -0x10(%rdi, %rdx)
+       ret
+
+       .p2align 4
+L(less_16bytes):
+       cmp     $8, %dl
+       jb      L(less_8bytes)
+       movq -0x08(%rsi, %rdx), %rcx
+       movq (%rsi),    %rsi
+       movq %rsi, (%rdi)
+       movq %rcx, -0x08(%rdi, %rdx)
+       ret
+
+       .p2align 4
+L(less_8bytes):
+       cmp     $4, %dl
+       jb      L(less_4bytes)
+       mov -0x04(%rsi, %rdx), %ecx
+       mov (%rsi),     %esi
+       mov %esi, (%rdi)
+       mov %ecx, -0x04(%rdi, %rdx)
+       ret
+
+L(less_4bytes):
+       cmp     $1, %dl
+       jbe     L(less_2bytes)
+       mov -0x02(%rsi, %rdx),  %cx
+       mov (%rsi),     %si
+       mov %si, (%rdi)
+       mov %cx, -0x02(%rdi, %rdx)
+       ret
+
+L(less_2bytes):
+       jb      L(less_0bytes)
+       mov     (%rsi), %cl
+       mov     %cl,    (%rdi)
+L(less_0bytes):
+       ret
+
+       .p2align 4
+L(256bytesormore):
+#ifdef USE_AS_MEMMOVE
+       mov     %rdi, %rcx
+       sub     %rsi, %rcx
+       cmp     %rdx, %rcx
+       jc      L(copy_backward)
+#endif
+       cmp     $2048, %rdx
+       jae     L(gobble_data_movsb)
+       mov     %rax, %r8
+       lea     (%rsi, %rdx), %rcx
+       mov     %rdi, %r10
+       vmovdqu -0x80(%rcx), %xmm5
+       vmovdqu -0x70(%rcx), %xmm6
+       mov     $0x80, %rax
+       and     $-32, %rdi
+       add     $32, %rdi
+       vmovdqu -0x60(%rcx), %xmm7
+       vmovdqu -0x50(%rcx), %xmm8
+       mov     %rdi, %r11
+       sub     %r10, %r11
+       vmovdqu -0x40(%rcx), %xmm9
+       vmovdqu -0x30(%rcx), %xmm10
+       sub     %r11, %rdx
+       vmovdqu -0x20(%rcx), %xmm11
+       vmovdqu -0x10(%rcx), %xmm12
+       vmovdqu (%rsi), %ymm4
+       add     %r11, %rsi
+       sub     %eax, %edx
+L(goble_128_loop):
+       vmovdqu (%rsi), %ymm0
+       vmovdqu 0x20(%rsi), %ymm1
+       vmovdqu 0x40(%rsi), %ymm2
+       vmovdqu 0x60(%rsi), %ymm3
+       add     %rax, %rsi
+       vmovdqa %ymm0, (%rdi)
+       vmovdqa %ymm1, 0x20(%rdi)
+       vmovdqa %ymm2, 0x40(%rdi)
+       vmovdqa %ymm3, 0x60(%rdi)
+       add     %rax, %rdi
+       sub     %eax, %edx
+       jae     L(goble_128_loop)
+       add     %eax, %edx
+       add     %rdi, %rdx
+       vmovdqu %ymm4, (%r10)
+       vzeroupper
+       vmovdqu %xmm5, -0x80(%rdx)
+       vmovdqu %xmm6, -0x70(%rdx)
+       vmovdqu %xmm7, -0x60(%rdx)
+       vmovdqu %xmm8, -0x50(%rdx)
+       vmovdqu %xmm9, -0x40(%rdx)
+       vmovdqu %xmm10, -0x30(%rdx)
+       vmovdqu %xmm11, -0x20(%rdx)
+       vmovdqu %xmm12, -0x10(%rdx)
+       mov     %r8, %rax
+       ret
+
+       .p2align 4
+L(gobble_data_movsb):
+#ifdef SHARED_CACHE_SIZE_HALF
+       mov     $SHARED_CACHE_SIZE_HALF, %rcx
+#else
+       mov     __x86_shared_cache_size_half(%rip), %rcx
+#endif
+       shl     $3, %rcx
+       cmp     %rcx, %rdx
+       jae     L(gobble_big_data_fwd)
+       mov     %rdx, %rcx
+       mov     %rdx, %rcx
+       rep     movsb
+       ret
+
+       .p2align 4
+L(gobble_big_data_fwd):
+       lea     (%rsi, %rdx), %rcx
+       vmovdqu (%rsi), %ymm4
+       vmovdqu -0x80(%rsi,%rdx), %xmm5
+       vmovdqu -0x70(%rcx), %xmm6
+       vmovdqu -0x60(%rcx), %xmm7
+       vmovdqu -0x50(%rcx), %xmm8
+       vmovdqu -0x40(%rcx), %xmm9
+       vmovdqu -0x30(%rcx), %xmm10
+       vmovdqu -0x20(%rcx), %xmm11
+       vmovdqu -0x10(%rcx), %xmm12
+       mov     %rdi, %r8
+       and     $-32, %rdi
+       add     $32, %rdi
+       mov     %rdi, %r10
+       sub     %r8, %r10
+       sub     %r10, %rdx
+       add     %r10, %rsi
+       lea     (%rdi, %rdx), %rcx
+       add     $-0x80, %rdx
+L(gobble_mem_fwd_loop):
+       prefetchnta 0x1c0(%rsi)
+       prefetchnta 0x280(%rsi)
+       vmovdqu (%rsi), %ymm0
+       vmovdqu 0x20(%rsi), %ymm1
+       vmovdqu 0x40(%rsi), %ymm2
+       vmovdqu 0x60(%rsi), %ymm3
+       sub     $-0x80, %rsi
+       vmovntdq        %ymm0, (%rdi)
+       vmovntdq        %ymm1, 0x20(%rdi)
+       vmovntdq        %ymm2, 0x40(%rdi)
+       vmovntdq        %ymm3, 0x60(%rdi)
+       sub     $-0x80, %rdi
+       add     $-0x80, %rdx
+       jb      L(gobble_mem_fwd_loop)
+       sfence
+       vmovdqu %ymm4, (%r8)
+       vzeroupper
+       vmovdqu %xmm5, -0x80(%rcx)
+       vmovdqu %xmm6, -0x70(%rcx)
+       vmovdqu %xmm7, -0x60(%rcx)
+       vmovdqu %xmm8, -0x50(%rcx)
+       vmovdqu %xmm9, -0x40(%rcx)
+       vmovdqu %xmm10, -0x30(%rcx)
+       vmovdqu %xmm11, -0x20(%rcx)
+       vmovdqu %xmm12, -0x10(%rcx)
+       ret
+
+#ifdef USE_AS_MEMMOVE
+       .p2align 4
+L(copy_backward):
+#ifdef SHARED_CACHE_SIZE_HALF
+       mov     $SHARED_CACHE_SIZE_HALF, %rcx
+#else
+       mov     __x86_shared_cache_size_half(%rip), %rcx
+#endif
+       shl     $3, %rcx
+       vmovdqu (%rsi), %xmm5
+       vmovdqu 0x10(%rsi), %xmm6
+       add     %rdx, %rdi
+       vmovdqu 0x20(%rsi), %xmm7
+       vmovdqu 0x30(%rsi), %xmm8
+       lea     -0x20(%rdi), %r10
+       mov %rdi, %r11
+       vmovdqu 0x40(%rsi), %xmm9
+       vmovdqu 0x50(%rsi), %xmm10
+       and     $0x1f, %r11
+       vmovdqu 0x60(%rsi), %xmm11
+       vmovdqu 0x70(%rsi), %xmm12
+       xor     %r11, %rdi
+       add     %rdx, %rsi
+       vmovdqu -0x20(%rsi), %ymm4
+       sub     %r11, %rsi
+       sub     %r11, %rdx
+       cmp     %rcx, %rdx
+       ja      L(gobble_big_data_bwd)
+       add     $-0x80, %rdx
+L(gobble_mem_bwd_llc):
+       vmovdqu -0x20(%rsi), %ymm0
+       vmovdqu -0x40(%rsi), %ymm1
+       vmovdqu -0x60(%rsi), %ymm2
+       vmovdqu -0x80(%rsi), %ymm3
+       lea     -0x80(%rsi), %rsi
+       vmovdqa %ymm0, -0x20(%rdi)
+       vmovdqa %ymm1, -0x40(%rdi)
+       vmovdqa %ymm2, -0x60(%rdi)
+       vmovdqa %ymm3, -0x80(%rdi)
+       lea     -0x80(%rdi), %rdi
+       add     $-0x80, %rdx
+       jb      L(gobble_mem_bwd_llc)
+       vmovdqu %ymm4, (%r10)
+       vzeroupper
+       vmovdqu %xmm5, (%rax)
+       vmovdqu %xmm6, 0x10(%rax)
+       vmovdqu %xmm7, 0x20(%rax)
+       vmovdqu %xmm8, 0x30(%rax)
+       vmovdqu %xmm9, 0x40(%rax)
+       vmovdqu %xmm10, 0x50(%rax)
+       vmovdqu %xmm11, 0x60(%rax)
+       vmovdqu %xmm12, 0x70(%rax)
+       ret
+
+       .p2align 4
+L(gobble_big_data_bwd):
+       add     $-0x80, %rdx
+L(gobble_mem_bwd_loop):
+       prefetchnta -0x1c0(%rsi)
+       prefetchnta -0x280(%rsi)
+       vmovdqu -0x20(%rsi), %ymm0
+       vmovdqu -0x40(%rsi), %ymm1
+       vmovdqu -0x60(%rsi), %ymm2
+       vmovdqu -0x80(%rsi), %ymm3
+       lea     -0x80(%rsi), %rsi
+       vmovntdq        %ymm0, -0x20(%rdi)
+       vmovntdq        %ymm1, -0x40(%rdi)
+       vmovntdq        %ymm2, -0x60(%rdi)
+       vmovntdq        %ymm3, -0x80(%rdi)
+       lea     -0x80(%rdi), %rdi
+       add     $-0x80, %rdx
+       jb      L(gobble_mem_bwd_loop)
+       sfence
+       vmovdqu %ymm4, (%r10)
+       vzeroupper
+       vmovdqu %xmm5, (%rax)
+       vmovdqu %xmm6, 0x10(%rax)
+       vmovdqu %xmm7, 0x20(%rax)
+       vmovdqu %xmm8, 0x30(%rax)
+       vmovdqu %xmm9, 0x40(%rax)
+       vmovdqu %xmm10, 0x50(%rax)
+       vmovdqu %xmm11, 0x60(%rax)
+       vmovdqu %xmm12, 0x70(%rax)
+       ret
+#endif
+END (MEMCPY)
+#endif
index 40ae92638642a9d52a58e157fc394bb53994d332..e6666954075f924f9f3a19e199e02b1310eb416b 100644 (file)
--- a/sysdeps/x86_64/multiarch/memcpy.S
+++ b/sysdeps/x86_64/multiarch/memcpy.S
@@ -32,6 +32,10 @@ ENTRY(__new_memcpy)
        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
        jne     1f
        call    __init_cpu_features
+1:     leaq    __memcpy_avx_unaligned(%rip), %rax
+       testl   $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
+       jz 1f
+       ret
 1:     leaq    __memcpy_sse2(%rip), %rax
        testl   $bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip)
        jnz     2f
index 3c0270fd2358661fd0b1131f7e4740832fe9fabe..076b19a9eac5b343d7be1a0e19e0e15ff6c48b7a 100644 (file)
--- a/sysdeps/x86_64/multiarch/memcpy_chk.S
+++ b/sysdeps/x86_64/multiarch/memcpy_chk.S
@@ -39,6 +39,9 @@ ENTRY(__memcpy_chk)
        testl   $bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
        jz      2f
        leaq    __memcpy_chk_ssse3_back(%rip), %rax
+       testl   $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
+       jz  2f
+       leaq    __memcpy_chk_avx_unaligned(%rip), %rax
 2:     ret
 END(__memcpy_chk)
 # else
diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned.S
new file mode 100644 (file)
index 0000000..faed9fb
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned.S
@@ -0,0 +1,22 @@
+/* memmove with AVX
+   Copyright (C) 2014 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define USE_AS_MEMMOVE
+#define MEMCPY         __memmove_avx_unaligned
+#define MEMCPY_CHK     __memmove_chk_avx_unaligned
+#include "memcpy-avx-unaligned.S"
index ba86e7bbb1cf50c85938f2cac5e4d0d17536e848..0c9af7e4dfeec1973f212a67ac7593d484677f26 100644 (file)
--- a/sysdeps/x86_64/multiarch/memmove.c
+++ b/sysdeps/x86_64/multiarch/memmove.c
@@ -35,6 +35,8 @@
 extern __typeof (__redirect_memmove) __memmove_sse2 attribute_hidden;
 extern __typeof (__redirect_memmove) __memmove_ssse3 attribute_hidden;
 extern __typeof (__redirect_memmove) __memmove_ssse3_back attribute_hidden;
+extern __typeof (__redirect_memmove) __memmove_avx_unaligned attribute_hidden;
+
 #endif
 
 #include "string/memmove.c"
@@ -47,10 +49,12 @@ extern __typeof (__redirect_memmove) __memmove_ssse3_back attribute_hidden;
    ifunc symbol properly.  */
 extern __typeof (__redirect_memmove) __libc_memmove;
 libc_ifunc (__libc_memmove,
-           HAS_SSSE3
-           ? (HAS_FAST_COPY_BACKWARD
-              ? __memmove_ssse3_back : __memmove_ssse3)
-           : __memmove_sse2)
+           HAS_AVX
+           ? __memmove_avx_unaligned
+           : (HAS_SSSE3
+              ? (HAS_FAST_COPY_BACKWARD
+                 ? __memmove_ssse3_back : __memmove_ssse3)
+              : __memmove_sse2));
 
 strong_alias (__libc_memmove, memmove)
 
index cb1acb65982399cd046a6dfefcf471844ec298f9..44344f2820c52c234672975326c3e659e719ec62 100644 (file)
--- a/sysdeps/x86_64/multiarch/memmove_chk.c
+++ b/sysdeps/x86_64/multiarch/memmove_chk.c
 extern __typeof (__memmove_chk) __memmove_chk_sse2 attribute_hidden;
 extern __typeof (__memmove_chk) __memmove_chk_ssse3 attribute_hidden;
 extern __typeof (__memmove_chk) __memmove_chk_ssse3_back attribute_hidden;
+extern __typeof (__memmove_chk) __memmove_chk_avx_unaligned attribute_hidden;
 
 #include "debug/memmove_chk.c"
 
 libc_ifunc (__memmove_chk,
-           HAS_SSSE3
+           HAS_AVX ? __memmove_chk_avx_unaligned :
+           (HAS_SSSE3
            ? (HAS_FAST_COPY_BACKWARD
               ? __memmove_chk_ssse3_back : __memmove_chk_ssse3)
-           : __memmove_chk_sse2);
+           : __memmove_chk_sse2));
diff --git a/sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S b/sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S
new file mode 100644 (file)
index 0000000..438bda3
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S
@@ -0,0 +1,22 @@
+/* mempcpy with AVX
+   Copyright (C) 2014 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define USE_AS_MEMPCPY
+#define MEMCPY         __mempcpy_avx_unaligned
+#define MEMCPY_CHK     __mempcpy_chk_avx_unaligned
+#include "memcpy-avx-unaligned.S"
index b9f04c2ec4d73fa6aa5f399ba5dd2c6f08a3fe1c..7589d8c1ec6b92d8d1388a00a745e59c3dfd55c4 100644 (file)
--- a/sysdeps/x86_64/multiarch/mempcpy.S
+++ b/sysdeps/x86_64/multiarch/mempcpy.S
@@ -37,6 +37,9 @@ ENTRY(__mempcpy)
        testl   $bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
        jz      2f
        leaq    __mempcpy_ssse3_back(%rip), %rax
+       testl   $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
+       jz      2f
+       leaq    __mempcpy_avx_unaligned(%rip), %rax
 2:     ret
 END(__mempcpy)
 
index c28473a6694cdf9b811b06a9da90f8d08111155f..88e0b74e83738d89b1508bd299d9926fe3c35a89 100644 (file)
--- a/sysdeps/x86_64/multiarch/mempcpy_chk.S
+++ b/sysdeps/x86_64/multiarch/mempcpy_chk.S
@@ -39,6 +39,9 @@ ENTRY(__mempcpy_chk)
        testl   $bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
        jz      2f
        leaq    __mempcpy_chk_ssse3_back(%rip), %rax
+       testl   $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
+       jz      2f
+       leaq    __mempcpy_chk_avx_unaligned(%rip), %rax
 2:     ret
 END(__mempcpy_chk)
 # else