Add x86-64 memset with unaligned store and rep stosb
author    H.J. Lu <hjl.tools@gmail.com>
          Thu, 31 Mar 2016 17:05:51 +0000 (10:05 -0700)
committer H.J. Lu <hjl.tools@gmail.com>
          Thu, 31 Mar 2016 17:06:07 +0000 (10:06 -0700)
Implement x86-64 memset with unaligned store and rep stosb.  Support
16-byte, 32-byte and 64-byte vector register sizes.  A single file
provides 2 implementations of memset, one with rep stosb and the other
without rep stosb.  They share the same code when the size is between 2
times the vector register size and REP_STOSB_THRESHOLD, which defaults
to 2KB.

Key features:

1. Use overlapping stores to avoid branches.
2. For sizes <= 4 times the vector register size, fully unroll the loop.
3. For sizes > 4 times the vector register size, store 4 times the
vector register size at a time (see the C sketch below).
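A minimal C sketch of that dispatch, for readers skimming the patch below.  It
is not the committed assembly: VEC_SIZE, store_bytes and the exact branch
order are illustrative assumptions, standing in for the unaligned vector
stores, rep stosb and the REP_STOSB_THRESHOLD macro defined in
memset-vec-unaligned-erms.S.

/* Illustrative sketch only -- not the committed assembly.  */
#include <stddef.h>

#define VEC_SIZE            32      /* illustrative: one AVX2 ymm register */
#define REP_STOSB_THRESHOLD 2048    /* default named above */

/* Stand-in for one unaligned vector store of LEN bytes starting at P.  */
static void
store_bytes (unsigned char *p, int c, size_t len)
{
  for (size_t i = 0; i < len; i++)
    p[i] = (unsigned char) c;
}

static void *
memset_sketch (void *dst, int c, size_t n)
{
  unsigned char *p = dst;

  if (n < VEC_SIZE)
    {
      store_bytes (p, c, n);        /* real code: overlapping integer stores */
      return dst;
    }
  if (n <= 2 * VEC_SIZE)
    {
      /* One store from the start and one ending at the end; they overlap
         in the middle, so this whole range needs no further branching.  */
      store_bytes (p, c, VEC_SIZE);
      store_bytes (p + n - VEC_SIZE, c, VEC_SIZE);
      return dst;
    }
  if (n > REP_STOSB_THRESHOLD)
    {
      store_bytes (p, c, n);        /* the _erms variant uses rep stosb here */
      return dst;
    }
  if (n <= 4 * VEC_SIZE)
    {
      /* Fully unrolled: two stores from the front, two ending at the back.  */
      store_bytes (p, c, VEC_SIZE);
      store_bytes (p + VEC_SIZE, c, VEC_SIZE);
      store_bytes (p + n - 2 * VEC_SIZE, c, VEC_SIZE);
      store_bytes (p + n - VEC_SIZE, c, VEC_SIZE);
      return dst;
    }
  /* n in (4 * VEC_SIZE, REP_STOSB_THRESHOLD]: store 4 vectors per iteration,
     then cover the remainder with an overlapping tail (the real code aligns
     the pointer to 4 * VEC_SIZE first).  */
  size_t i = 0;
  while (i + 4 * VEC_SIZE <= n)
    {
      store_bytes (p + i, c, 4 * VEC_SIZE);
      i += 4 * VEC_SIZE;
    }
  if (i < n)
    store_bytes (p + n - 4 * VEC_SIZE, c, 4 * VEC_SIZE);
  return dst;
}

The point of the layout is that every size class below the threshold is
handled with a fixed, small number of stores, and rep stosb is only used once
its setup cost can be amortized.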

[BZ #19881]
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
memset-sse2-unaligned-erms, memset-avx2-unaligned-erms and
memset-avx512-unaligned-erms.
* sysdeps/x86_64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Test __memset_chk_sse2_unaligned,
__memset_chk_sse2_unaligned_erms, __memset_chk_avx2_unaligned,
__memset_chk_avx2_unaligned_erms, __memset_chk_avx512_unaligned,
__memset_chk_avx512_unaligned_erms, __memset_sse2_unaligned,
__memset_sse2_unaligned_erms, __memset_erms,
__memset_avx2_unaligned, __memset_avx2_unaligned_erms,
__memset_avx512_unaligned_erms and __memset_avx512_unaligned.
* sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S: New
file.
* sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S:
Likewise.
* sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S:
Likewise.
* sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S:
Likewise.

ChangeLog
sysdeps/x86_64/multiarch/Makefile
sysdeps/x86_64/multiarch/ifunc-impl-list.c
sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S [new file with mode: 0644]
sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S [new file with mode: 0644]
sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S [new file with mode: 0644]
sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S [new file with mode: 0644]

index 100764f66acbb3e459bf9e79b41a38e87f9916ad..1a87d43e1139da917296a74dc947148543a8d274 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,26 @@
+2016-03-31   H.J. Lu  <hongjiu.lu@intel.com>
+
+       [BZ #19881]
+       * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
+       memset-sse2-unaligned-erms, memset-avx2-unaligned-erms and
+       memset-avx512-unaligned-erms.
+       * sysdeps/x86_64/multiarch/ifunc-impl-list.c
+       (__libc_ifunc_impl_list): Test __memset_chk_sse2_unaligned,
+       __memset_chk_sse2_unaligned_erms, __memset_chk_avx2_unaligned,
+       __memset_chk_avx2_unaligned_erms, __memset_chk_avx512_unaligned,
+       __memset_chk_avx512_unaligned_erms, __memset_sse2_unaligned,
+       __memset_sse2_unaligned_erms, __memset_erms,
+       __memset_avx2_unaligned, __memset_avx2_unaligned_erms,
+       __memset_avx512_unaligned_erms and __memset_avx512_unaligned.
+       * sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S: New
+       file.
+       * sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S:
+       Likewise.
+       * sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S:
+       Likewise.
+       * sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S:
+       Likewise.
+
 2016-03-31   H.J. Lu  <hongjiu.lu@intel.com>
 
        [BZ #19776]
index ef4dbc0c6f07dea58db0b5f02f3b08cdb46072f9..8878efbc8fed943beaef8e1e37f21d1a90b80083 100644 (file)
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -23,7 +23,10 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
                   memset-avx512-no-vzeroupper \
                   memmove-sse2-unaligned-erms \
                   memmove-avx-unaligned-erms \
-                  memmove-avx512-unaligned-erms
+                  memmove-avx512-unaligned-erms \
+                  memset-sse2-unaligned-erms \
+                  memset-avx2-unaligned-erms \
+                  memset-avx512-unaligned-erms
 CFLAGS-varshift.c += -msse4
 CFLAGS-strcspn-c.c += -msse4
 CFLAGS-strpbrk-c.c += -msse4
index 9204da450a50e7a0d9f4c95f9513e2bd639a6da7..1e880f6edcd1213332a03c71c77965a2569b1606 100644 (file)
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -118,10 +118,26 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   IFUNC_IMPL (i, name, __memset_chk,
              IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
                              __memset_chk_sse2)
+             IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
+                             __memset_chk_sse2_unaligned)
+             IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
+                             __memset_chk_sse2_unaligned_erms)
              IFUNC_IMPL_ADD (array, i, __memset_chk,
                              HAS_ARCH_FEATURE (AVX2_Usable),
                              __memset_chk_avx2)
+             IFUNC_IMPL_ADD (array, i, __memset_chk,
+                             HAS_ARCH_FEATURE (AVX2_Usable),
+                             __memset_chk_avx2_unaligned)
+             IFUNC_IMPL_ADD (array, i, __memset_chk,
+                             HAS_ARCH_FEATURE (AVX2_Usable),
+                             __memset_chk_avx2_unaligned_erms)
 #ifdef HAVE_AVX512_ASM_SUPPORT
+             IFUNC_IMPL_ADD (array, i, __memset_chk,
+                             HAS_ARCH_FEATURE (AVX512F_Usable),
+                             __memset_chk_avx512_unaligned_erms)
+             IFUNC_IMPL_ADD (array, i, __memset_chk,
+                             HAS_ARCH_FEATURE (AVX512F_Usable),
+                             __memset_chk_avx512_unaligned)
              IFUNC_IMPL_ADD (array, i, __memset_chk,
                              HAS_ARCH_FEATURE (AVX512F_Usable),
                              __memset_chk_avx512_no_vzeroupper)
@@ -131,10 +147,27 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/x86_64/multiarch/memset.S.  */
   IFUNC_IMPL (i, name, memset,
              IFUNC_IMPL_ADD (array, i, memset, 1, __memset_sse2)
+             IFUNC_IMPL_ADD (array, i, memset, 1,
+                             __memset_sse2_unaligned)
+             IFUNC_IMPL_ADD (array, i, memset, 1,
+                             __memset_sse2_unaligned_erms)
+             IFUNC_IMPL_ADD (array, i, memset, 1, __memset_erms)
              IFUNC_IMPL_ADD (array, i, memset,
                              HAS_ARCH_FEATURE (AVX2_Usable),
                              __memset_avx2)
+             IFUNC_IMPL_ADD (array, i, memset,
+                             HAS_ARCH_FEATURE (AVX2_Usable),
+                             __memset_avx2_unaligned)
+             IFUNC_IMPL_ADD (array, i, memset,
+                             HAS_ARCH_FEATURE (AVX2_Usable),
+                             __memset_avx2_unaligned_erms)
 #ifdef HAVE_AVX512_ASM_SUPPORT
+             IFUNC_IMPL_ADD (array, i, memset,
+                             HAS_ARCH_FEATURE (AVX512F_Usable),
+                             __memset_avx512_unaligned_erms)
+             IFUNC_IMPL_ADD (array, i, memset,
+                             HAS_ARCH_FEATURE (AVX512F_Usable),
+                             __memset_avx512_unaligned)
              IFUNC_IMPL_ADD (array, i, memset,
                              HAS_ARCH_FEATURE (AVX512F_Usable),
                              __memset_avx512_no_vzeroupper)
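The IFUNC_IMPL_ADD entries above only register the new variants so glibc's
multiarch tests and benchmarks can enumerate every implementation; they do
not pick the one used at run time.  A minimal C sketch of what each
registration amounts to, assuming a libc_ifunc_impl-style record (field and
helper names here are illustrative, not taken from this patch):

/* Illustrative only: roughly what one IFUNC_IMPL_ADD line records.  */
#include <stdbool.h>
#include <stddef.h>

struct impl_entry
{
  const char *name;       /* e.g. "__memset_avx2_unaligned_erms" */
  void (*fn) (void);      /* address of that implementation */
  bool usable;            /* result of HAS_ARCH_FEATURE (...) on this CPU */
};

/* Append one candidate to ARRAY if there is room, so that test and
   benchmark drivers can walk every variant.  */
static size_t
impl_add (struct impl_entry *array, size_t i, size_t max,
          const char *name, bool usable, void (*fn) (void))
{
  if (i < max)
    array[i++] = (struct impl_entry) { name, fn, usable };
  return i;
}

Each HAS_ARCH_FEATURE check simply fills in the usable flag for the CPU the
list is built on.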
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
new file mode 100644 (file)
index 0000000..e0dc565
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
@@ -0,0 +1,14 @@
+#define VEC_SIZE       32
+#define VEC(i)         ymm##i
+#define VMOVU          vmovdqu
+#define VMOVA          vmovdqa
+
+#define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+  vmovd d, %xmm0; \
+  movq r, %rax; \
+  vpbroadcastb %xmm0, %ymm0
+
+#define SECTION(p)             p##.avx
+#define MEMSET_SYMBOL(p,s)     p##_avx2_##s
+
+#include "memset-vec-unaligned-erms.S"
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
new file mode 100644 (file)
index 0000000..72f4095
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
@@ -0,0 +1,17 @@
+#ifdef HAVE_AVX512_ASM_SUPPORT
+# define VEC_SIZE      64
+# define VEC(i)                zmm##i
+# define VMOVU         vmovdqu64
+# define VMOVA         vmovdqa64
+
+# define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+  vmovd d, %xmm0; \
+  movq r, %rax; \
+  vpbroadcastb %xmm0, %xmm0; \
+  vpbroadcastq %xmm0, %zmm0
+
+# define SECTION(p)            p##.avx512
+# define MEMSET_SYMBOL(p,s)    p##_avx512_##s
+
+# include "memset-vec-unaligned-erms.S"
+#endif
diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
new file mode 100644 (file)
index 0000000..437a858
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
@@ -0,0 +1,16 @@
+#define VEC_SIZE       16
+#define VEC(i)         xmm##i
+#define VMOVU          movdqu
+#define VMOVA          movdqa
+
+#define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+  movd d, %xmm0; \
+  movq r, %rax; \
+  punpcklbw %xmm0, %xmm0; \
+  punpcklwd %xmm0, %xmm0; \
+  pshufd $0, %xmm0, %xmm0
+
+#define SECTION(p)             p
+#define MEMSET_SYMBOL(p,s)     p##_sse2_##s
+
+#include "memset-vec-unaligned-erms.S"
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
new file mode 100644 (file)
index 0000000..9383517
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -0,0 +1,251 @@
+/* memset/bzero with unaligned store and rep stosb
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* memset is implemented as:
+   1. Use overlapping stores to avoid branches.
+   2. Force 32-bit displacement for branches to avoid long nop between
+      instructions.
+   3. If size is less than VEC, use integer register stores.
+   4. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores.
+   5. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores.
+   6. If size is more than 4 * VEC_SIZE, align to 4 * VEC_SIZE with
+      4 VEC stores and store 4 * VEC at a time until done.  */
+
+#include <sysdep.h>
+
+#ifndef VZEROUPPER
+# if VEC_SIZE > 16
+#  define VZEROUPPER                   vzeroupper
+# else
+#  define VZEROUPPER
+# endif
+#endif
+
+#ifndef VZEROUPPER_SHORT_RETURN
+# if VEC_SIZE > 16
+#  define VZEROUPPER_SHORT_RETURN      vzeroupper
+# else
+#  define VZEROUPPER_SHORT_RETURN      rep
+# endif
+#endif
+
+#ifndef MOVQ
+# if VEC_SIZE > 16
+#  define MOVQ                         vmovq
+# else
+#  define MOVQ                         movq
+# endif
+#endif
+
+/* Threshold to use Enhanced REP STOSB.  Since there is overhead to set
+   up REP STOSB operation, REP STOSB isn't faster on short data.  The
+   memset micro benchmark in glibc shows that 2KB is the approximate
+   value above which REP STOSB becomes faster on processors with
+   Enhanced REP STOSB.  Since the stored value is fixed, larger register
+   size has minimal impact on threshold.  */
+#ifndef REP_STOSB_THRESHOLD
+# define REP_STOSB_THRESHOLD           2048
+#endif
+
+#ifndef SECTION
+# error SECTION is not defined!
+#endif
+
+#if !defined USE_MULTIARCH && IS_IN (libc)
+       .section SECTION(.text),"ax",@progbits
+ENTRY (__bzero)
+       movq    %rdi, %rax /* Set return value.  */
+       movq    %rsi, %rdx /* Set n.  */
+       pxor    %xmm0, %xmm0
+       jmp     L(entry_from_bzero)
+END (__bzero)
+weak_alias (__bzero, bzero)
+#endif
+
+#if defined SHARED && IS_IN (libc)
+ENTRY_CHK (MEMSET_SYMBOL (__memset_chk, unaligned))
+       cmpq    %rdx, %rcx
+       jb      HIDDEN_JUMPTARGET (__chk_fail)
+END_CHK (MEMSET_SYMBOL (__memset_chk, unaligned))
+#endif
+
+ENTRY (MEMSET_SYMBOL (__memset, unaligned))
+L(memset_entry):
+       VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+L(entry_from_bzero):
+       cmpq    $VEC_SIZE, %rdx
+       jb      L(less_vec)
+       cmpq    $(VEC_SIZE * 2), %rdx
+       ja      L(more_2x_vec)
+       /* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
+       VMOVU   %VEC(0), -VEC_SIZE(%rdi,%rdx)
+       VMOVU   %VEC(0), (%rdi)
+       VZEROUPPER
+       ret
+END (MEMSET_SYMBOL (__memset, unaligned))
+
+#if VEC_SIZE == 16
+/* Only used to measure performance of REP STOSB.  */
+ENTRY (__memset_erms)
+#else
+/* Provide a symbol to debugger.  */
+ENTRY (MEMSET_SYMBOL (__memset, erms))
+#endif
+L(stosb):
+       movq    %rdx, %rcx
+       movzbl  %sil, %eax
+       movq    %rdi, %rdx
+       rep stosb
+       movq    %rdx, %rax
+       ret
+#if VEC_SIZE == 16
+END (__memset_erms)
+#else
+END (MEMSET_SYMBOL (__memset, erms))
+#endif
+
+#if defined SHARED && IS_IN (libc)
+ENTRY_CHK (MEMSET_SYMBOL (__memset_chk, unaligned_erms))
+       cmpq    %rdx, %rcx
+       jb      HIDDEN_JUMPTARGET (__chk_fail)
+END_CHK (MEMSET_SYMBOL (__memset_chk, unaligned_erms))
+#endif
+
+ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
+       VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+       cmpq    $VEC_SIZE, %rdx
+       jb      L(less_vec)
+       cmpq    $(VEC_SIZE * 2), %rdx
+       ja      L(stosb_more_2x_vec)
+       /* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
+       VMOVU   %VEC(0), -VEC_SIZE(%rdi,%rdx)
+       VMOVU   %VEC(0), (%rdi)
+       VZEROUPPER
+       ret
+
+       .p2align 4
+L(stosb_more_2x_vec):
+       cmpq    $REP_STOSB_THRESHOLD, %rdx
+       /* Force 32-bit displacement to avoid long nop between
+          instructions.  */
+       ja.d32  L(stosb)
+       .p2align 4
+L(more_2x_vec):
+       cmpq  $(VEC_SIZE * 4), %rdx
+       ja      L(loop_start)
+       VMOVU   %VEC(0), (%rdi)
+       VMOVU   %VEC(0), VEC_SIZE(%rdi)
+       VMOVU   %VEC(0), -VEC_SIZE(%rdi,%rdx)
+       VMOVU   %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
+L(return):
+       VZEROUPPER
+       ret
+
+       .p2align 4
+L(loop_start):
+       leaq    (VEC_SIZE * 4)(%rdi), %rcx
+       VMOVU   %VEC(0), (%rdi)
+       andq    $-(VEC_SIZE * 4), %rcx
+       VMOVU   %VEC(0), -VEC_SIZE(%rdi,%rdx)
+       VMOVU   %VEC(0), VEC_SIZE(%rdi)
+       VMOVU   %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
+       VMOVU   %VEC(0), (VEC_SIZE * 2)(%rdi)
+       VMOVU   %VEC(0), -(VEC_SIZE * 3)(%rdi,%rdx)
+       VMOVU   %VEC(0), (VEC_SIZE * 3)(%rdi)
+       VMOVU   %VEC(0), -(VEC_SIZE * 4)(%rdi,%rdx)
+       addq    %rdi, %rdx
+       andq    $-(VEC_SIZE * 4), %rdx
+       cmpq    %rdx, %rcx
+# if VEC_SIZE == 32 || VEC_SIZE == 64
+       /* Force 32-bit displacement to avoid long nop between
+          instructions.  */
+       je.d32  L(return)
+# else
+       je      L(return)
+# endif
+       .p2align 4
+L(loop):
+       VMOVA   %VEC(0), (%rcx)
+       VMOVA   %VEC(0), VEC_SIZE(%rcx)
+       VMOVA   %VEC(0), (VEC_SIZE * 2)(%rcx)
+       VMOVA   %VEC(0), (VEC_SIZE * 3)(%rcx)
+       addq    $(VEC_SIZE * 4), %rcx
+       cmpq    %rcx, %rdx
+       jne     L(loop)
+       VZEROUPPER_SHORT_RETURN
+       ret
+L(less_vec):
+       /* Less than 1 VEC.  */
+# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
+#  error Unsupported VEC_SIZE!
+# endif
+# if VEC_SIZE > 32
+       cmpb    $32, %dl
+       jae     L(between_32_63)
+# endif
+# if VEC_SIZE > 16
+       cmpb    $16, %dl
+       jae     L(between_16_31)
+# endif
+       MOVQ    %xmm0, %rcx
+       cmpb    $8, %dl
+       jae     L(between_8_15)
+       cmpb    $4, %dl
+       jae     L(between_4_7)
+       cmpb    $1, %dl
+       ja      L(between_2_3)
+       jb      1f
+       movb    %cl, (%rdi)
+1:
+       VZEROUPPER
+       ret
+# if VEC_SIZE > 32
+       /* From 32 to 63.  No branch when size == 32.  */
+L(between_32_63):
+       vmovdqu %ymm0, -32(%rdi,%rdx)
+       vmovdqu %ymm0, (%rdi)
+       VZEROUPPER
+       ret
+# endif
+# if VEC_SIZE > 16
+       /* From 16 to 31.  No branch when size == 16.  */
+L(between_16_31):
+       vmovdqu %xmm0, -16(%rdi,%rdx)
+       vmovdqu %xmm0, (%rdi)
+       VZEROUPPER
+       ret
+# endif
+       /* From 8 to 15.  No branch when size == 8.  */
+L(between_8_15):
+       movq    %rcx, -8(%rdi,%rdx)
+       movq    %rcx, (%rdi)
+       VZEROUPPER
+       ret
+L(between_4_7):
+       /* From 4 to 7.  No branch when size == 4.  */
+       movl    %ecx, -4(%rdi,%rdx)
+       movl    %ecx, (%rdi)
+       VZEROUPPER
+       ret
+L(between_2_3):
+       /* From 2 to 3.  No branch when size == 2.  */
+       movw    %cx, -2(%rdi,%rdx)
+       movw    %cx, (%rdi)
+       VZEROUPPER
+       ret
+END (MEMSET_SYMBOL (__memset, unaligned_erms))
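For the sub-VEC_SIZE path above (L(less_vec) through L(between_2_3)), the
fill value is moved into a general-purpose register and any length from 1 to
15 bytes is covered by at most two overlapping integer stores or a single
byte store.  A minimal C sketch of that idea, under the assumption of
unaligned 8/4/2-byte stores (the helper name memset_small is illustrative):

/* Illustrative sketch of the overlapping head/tail trick for n < 16:
   one store from the start and one ending at the end of the buffer
   cover the whole range without a per-byte loop.  */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void
memset_small (unsigned char *p, unsigned char c, size_t n)
{
  uint64_t v = 0x0101010101010101ULL * c;   /* byte broadcast, like MOVQ %xmm0 */

  if (n >= 8)
    {
      memcpy (p, &v, 8);                    /* unaligned 8-byte store */
      memcpy (p + n - 8, &v, 8);            /* overlaps when n < 16 */
    }
  else if (n >= 4)
    {
      uint32_t w = (uint32_t) v;
      memcpy (p, &w, 4);
      memcpy (p + n - 4, &w, 4);
    }
  else if (n >= 2)
    {
      uint16_t h = (uint16_t) v;
      memcpy (p, &h, 2);
      memcpy (p + n - 2, &h, 2);
    }
  else if (n == 1)
    p[0] = c;
}

The same head/tail overlap is applied at 16 and 32 bytes with xmm and ymm
stores in L(between_16_31) and L(between_32_63) when VEC_SIZE is larger.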