/* memset/bzero -- set memory area to CH/0
Optimized version for x86-64.
- Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
+ Copyright (C) 2002-2016 Free Software Foundation, Inc.
This file is part of the GNU C Library.
- Contributed by Andreas Jaeger <aj@suse.de>.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, write to the Free
- Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
- 02111-1307 USA. */
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
#include <sysdep.h>
-#include "asm-syntax.h"
-#include "bp-sym.h"
-#include "bp-asm.h"
-/* BEWARE: `#ifdef memset' means that memset is redefined as `bzero' */
-#define BZERO_P (defined memset)
-
-/* This is somehow experimental and could made dependend on the cache
- size. */
-#define LARGE $120000
+ .text
+#if IS_IN (libc)
+ENTRY(__bzero) /* Zero-fill: %rdi = start of area, %rsi = byte count; reuses memset's store paths. */
+ movq %rdi, %rax /* Set return value: %rax = start of the area, exactly as memset sets it. */
+ movq %rsi, %rdx /* Set n: the shared code reads the length from %rdx. */
+ pxor %xmm0, %xmm0 /* Fill pattern: all 16 bytes of %xmm0 are zero. */
+ jmp L(entry_from_bzero) /* Join memset just after its byte-broadcast prologue. */
+END(__bzero)
+weak_alias (__bzero, bzero)
+
+/* Like memset (%rdi = destination, %esi = fill byte, %rdx = length), but takes an additional parameter in %rcx holding the value to return in %rax. */
+ENTRY(__memset_tail)
+ movq %rcx, %rax /* Set return value: caller-supplied, so %rax need not equal %rdi on this path. */
+
+ movd %esi, %xmm0 /* Low dword of %xmm0 = %esi; only its low byte is used as the fill byte. */
+ punpcklbw %xmm0, %xmm0 /* Duplicate each low byte: low word = fill byte x2. */
+ punpcklwd %xmm0, %xmm0 /* Duplicate each low word: low dword = fill byte x4. */
+ pshufd $0, %xmm0, %xmm0 /* Broadcast dword 0: all 16 bytes = fill byte. */
+
+ jmp L(entry_from_bzero) /* Continue in memset's size dispatch; %rdi and %rdx are passed through untouched. */
+END(__memset_tail)
+#endif
- .text
-#if !BZERO_P && defined PIC && !defined NOT_IN_libc
-ENTRY (__memset_chk)
+#if defined PIC && IS_IN (libc)
+ENTRY_CHK (__memset_chk) /* memset preceded by a bounds check of the length (%rdx) against %rcx. */
cmpq %rdx, %rcx /* Compare %rcx with the length; %rcx is presumably the destination object size -- confirm against the __memset_chk prototype. */
jb HIDDEN_JUMPTARGET (__chk_fail) /* %rcx < %rdx (unsigned compare): jump to __chk_fail. */
-END (__memset_chk)
-#endif
-ENTRY (memset)
-#if BZERO_P
- mov %rsi,%rdx /* Adjust parameter. */
- xorl %esi,%esi /* Fill with 0s. */
+END_CHK (__memset_chk) /* No ret or jmp above: when the check passes, execution continues into the code placed after this entry. */
#endif
- cmp $0x7,%rdx /* Check for small length. */
- mov %rdi,%rcx /* Save ptr as return value. */
- jbe 7f
-
-#if BZERO_P
- mov %rsi,%r8 /* Just copy 0. */
-#else
- /* Populate 8 bit data to full 64-bit. */
- movabs $0x0101010101010101,%r8
- movzbl %sil,%eax
- imul %rax,%r8
-#endif
- test $0x7,%edi /* Check for alignment. */
- je 2f
+ENTRY (memset) /* %rdi = destination, %esi = fill byte (low 8 bits used), %rdx = length n; returns the destination in %rax. */
+ movd %esi, %xmm0 /* Low dword of %xmm0 = %esi. */
+ movq %rdi, %rax /* Return value: the destination pointer. */
+ punpcklbw %xmm0, %xmm0 /* Duplicate each low byte: low word = fill byte x2. */
+ punpcklwd %xmm0, %xmm0 /* Duplicate each low word: low dword = fill byte x4. */
+ pshufd $0, %xmm0, %xmm0 /* Broadcast dword 0: all 16 bytes of %xmm0 = fill byte. */
+L(entry_from_bzero): /* Shared entry. Expects %rdi = start, %rdx = n, %xmm0 = 16-byte fill pattern, %rax = return value. */
+ cmpq $64, %rdx
+ ja L(loop_start) /* n > 64: unaligned head/tail stores plus an aligned 64-byte loop. */
+ cmpq $16, %rdx
+ jbe L(less_16_bytes) /* n <= 16: scalar stores. */
+ cmpq $32, %rdx /* Flags are consumed by the ja below; the two SSE stores in between do not modify them. */
+ movdqu %xmm0, (%rdi) /* 16 < n <= 64: store the first 16 bytes ... */
+ movdqu %xmm0, -16(%rdi,%rdx) /* ... and the last 16 bytes; the two may overlap, and together they cover any n <= 32. */
+ ja L(between_32_64_bytes) /* n > 32: two more 16-byte stores are needed. */
+L(return):
+ rep /* rep prefix on ret; presumably the usual workaround for AMD branch predictors when ret is a branch target -- confirm. */
+ ret
.p2align 4
-1: /* Align ptr to 8 byte. */
- mov %sil,(%rcx)
- dec %rdx
- inc %rcx
- test $0x7,%ecx
- jne 1b
-
-2: /* Check for really large regions. */
- mov %rdx,%rax
- shr $0x6,%rax
- je 4f
- cmp LARGE, %rdx
- jae 11f
-
+L(between_32_64_bytes): /* 32 < n <= 64; bytes [0,16) and [n-16,n) were already stored before the branch here. */
+ movdqu %xmm0, 16(%rdi) /* Bytes [16,32). */
+ movdqu %xmm0, -32(%rdi,%rdx) /* Bytes [n-32,n-16); with n <= 64 the four stores together cover [0,n). */
+ ret
.p2align 4
-3: /* Copy 64 bytes. */
- mov %r8,(%rcx)
- mov %r8,0x8(%rcx)
- mov %r8,0x10(%rcx)
- mov %r8,0x18(%rcx)
- mov %r8,0x20(%rcx)
- mov %r8,0x28(%rcx)
- mov %r8,0x30(%rcx)
- mov %r8,0x38(%rcx)
- add $0x40,%rcx
- dec %rax
- jne 3b
-
-4: /* Copy final bytes. */
- and $0x3f,%edx
- mov %rdx,%rax
- shr $0x3,%rax
- je 6f
-
-5: /* First in chunks of 8 bytes. */
- mov %r8,(%rcx)
- add $0x8,%rcx
- dec %rax
- jne 5b
-6:
- and $0x7,%edx
-7:
- test %rdx,%rdx
- je 9f
-8: /* And finally as bytes (up to 7). */
- mov %sil,(%rcx)
- inc %rcx
- dec %rdx
- jne 8b
-9:
-#if BZERO_P
- nop
-#else
- /* Load result (only if used as memset). */
- mov %rdi,%rax /* start address of destination is result */
-#endif
- retq
-
+L(loop_start): /* n > 64: fill the first and last 64 bytes with unaligned stores, then the 64-byte-aligned middle with aligned stores. */
+ leaq 64(%rdi), %rcx /* With the andq below: %rcx = first 64-byte boundary strictly above %rdi. */
+ movdqu %xmm0, (%rdi) /* Head bytes [0,16). */
+ andq $-64, %rcx /* Round %rdi + 64 down to a multiple of 64. */
+ movdqu %xmm0, -16(%rdi,%rdx) /* Tail bytes [n-16,n). */
+ movdqu %xmm0, 16(%rdi) /* Head bytes [16,32). */
+ movdqu %xmm0, -32(%rdi,%rdx) /* Tail bytes [n-32,n-16). */
+ movdqu %xmm0, 32(%rdi) /* Head bytes [32,48). */
+ movdqu %xmm0, -48(%rdi,%rdx) /* Tail bytes [n-48,n-32). */
+ movdqu %xmm0, 48(%rdi) /* Head bytes [48,64). */
+ movdqu %xmm0, -64(%rdi,%rdx) /* Tail bytes [n-64,n-48). */
+ addq %rdi, %rdx /* %rdx = end address (start + n) ... */
+ andq $-64, %rdx /* ... rounded down to a multiple of 64: end of the aligned middle. */
+ cmpq %rdx, %rcx
+ je L(return) /* Aligned middle is empty: the head and tail stores already covered everything. */
.p2align 4
-11: /* Copy 64 bytes without polluting the cache. */
- /* We could use movntdq %xmm0,(%rcx) here to further
- speed up for large cases but let's not use XMM registers. */
- movnti %r8,(%rcx)
- movnti %r8,0x8(%rcx)
- movnti %r8,0x10(%rcx)
- movnti %r8,0x18(%rcx)
- movnti %r8,0x20(%rcx)
- movnti %r8,0x28(%rcx)
- movnti %r8,0x30(%rcx)
- movnti %r8,0x38(%rcx)
- add $0x40,%rcx
- dec %rax
- jne 11b
- jmp 4b
+L(loop): /* Invariant: %rcx and %rdx are multiples of 64 and %rcx < %rdx on entry to each iteration. */
+ movdqa %xmm0, (%rcx) /* Aligned stores are valid because %rcx is 64-byte aligned. */
+ movdqa %xmm0, 16(%rcx)
+ movdqa %xmm0, 32(%rcx)
+ movdqa %xmm0, 48(%rcx)
+ addq $64, %rcx /* Advance one 64-byte chunk. */
+ cmpq %rcx, %rdx
+ jne L(loop) /* Repeat until the cursor reaches the aligned end. */
+ rep /* rep prefix on ret, same two-byte return form as L(return). */
+ ret
+L(less_16_bytes): /* n <= 16 (n in %rdx, so %dl holds it in full). Dispatch on the bits of n; every store is based on the destination in %rdi. */
+ movq %xmm0, %rcx /* %rcx = 8 copies of the fill byte, for the scalar stores below. */
+ testb $24, %dl /* Bit 3 or bit 4 set: 8 <= n <= 16. */
+ jne L(between8_16bytes)
+ testb $4, %dl /* Bit 2 set: 4 <= n <= 7. */
+ jne L(between4_7bytes)
+ testb $1, %dl /* Here n <= 3. Bit 0 set: n is odd, so write the leading byte. */
+ je L(odd_byte)
+ movb %cl, (%rdi)
+L(odd_byte): /* Despite its name, reached for both odd and even n. */
+ testb $2, %dl /* Bit 1 set: n is 2 or 3, so write the last two bytes. */
+ je L(return)
+ movw %cx, -2(%rdi,%rdx) /* Base on %rdi, not %rax: %rax is the return value, which __memset_tail loads from %rcx and which therefore need not equal the destination. */
+ ret
+L(between4_7bytes): /* Two possibly overlapping 4-byte stores cover [0,n) for 4 <= n <= 7. */
+ movl %ecx, (%rdi)
+ movl %ecx, -4(%rdi,%rdx)
+ ret
+L(between8_16bytes): /* Two possibly overlapping 8-byte stores cover [0,n) for 8 <= n <= 16. */
+ movq %rcx, (%rdi)
+ movq %rcx, -8(%rdi,%rdx)
+ ret
END (memset)
-#if !BZERO_P
libc_hidden_builtin_def (memset)
-#endif
-#if !BZERO_P && defined PIC && !defined NOT_IN_libc
+#if defined PIC && IS_IN (libc) && !defined USE_MULTIARCH
strong_alias (__memset_chk, __memset_zero_constant_len_parameter)
.section .gnu.warning.__memset_zero_constant_len_parameter
.string "memset used with constant zero length parameter; this could be due to transposed parameters"