/* Provenance: glibc sysdeps/x86_64/memset.S, commit 78df0fcb (Andreas
   Jaeger); recovered from a gitweb blame page — viewer residue removed.  */
/* memset/bzero -- set memory area to CH/0
   Optimized version for x86-64.
   Copyright (C) 2002 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by Andreas Jaeger <aj@suse.de>.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, write to the Free
   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307 USA.  */

#include <sysdep.h>
#include "asm-syntax.h"
#include "bp-sym.h"
#include "bp-asm.h"

/* BEWARE: `#ifdef memset' means that memset is redefined as `bzero'
   (the bzero build compiles this very file with -Dmemset=bzero).  */
#define BZERO_P (defined memset)

/* Byte count above which we switch to non-temporal stores to avoid
   polluting the cache.  Somewhat experimental; could be made dependent
   on the actual cache size.  */
#define LARGE $120000

/* System V AMD64 ABI:
     memset: %rdi = dest, %rsi = fill byte (low 8 bits used),
             %rdx = count; returns dest in %rax.
     bzero:  %rdi = dest, %rsi = count (moved into %rdx below);
             returns nothing.
   Register roles throughout the body:
     %rcx = current write pointer      %rdx = bytes remaining
     %r8  = fill byte replicated to 8  %rax = scratch / chunk counter.  */
	.text
ENTRY (memset)
#if BZERO_P
	mov	%rsi,%rdx	/* bzero(dest, n): count arrives in %rsi.  */
	xorq	%rsi,%rsi	/* Fill with 0s.  */
#endif
	cmp	$0x7,%rdx	/* Small lengths are handled byte-wise.  */
	mov	%rdi,%rcx	/* Save ptr as return value.  */
	jbe	7f

#if BZERO_P
	mov	%rsi,%r8	/* Just copy 0 (%rsi is already zero).  */
#else
	/* Replicate the 8-bit fill value into all 8 bytes of %r8:
	   %r8 = 0x0101010101010101 * (unsigned char) %rsi.  */
	movabs	$0x0101010101010101,%r8
	movzbl	%sil,%eax
	imul	%rax,%r8
#endif
	test	$0x7,%edi	/* Already 8-byte aligned?  */
	je	2f

	.p2align 4
1:	/* Align ptr to an 8-byte boundary, one byte at a time.
	   Safe because we know count > 7 here.  */
	mov	%sil,(%rcx)
	dec	%rdx
	inc	%rcx
	test	$0x7,%ecx
	jne	1b

2:	/* %rax = number of full 64-byte chunks; pick the store strategy.  */
	mov	%rdx,%rax
	shr	$0x6,%rax
	je	4f		/* Fewer than 64 bytes remain.  */
	cmp	LARGE, %rdx
	jae	11f		/* Really large region: bypass the cache.  */

	.p2align 4
3:	/* Set 64 bytes per iteration with ordinary stores.  */
	mov	%r8,(%rcx)
	mov	%r8,0x8(%rcx)
	mov	%r8,0x10(%rcx)
	mov	%r8,0x18(%rcx)
	mov	%r8,0x20(%rcx)
	mov	%r8,0x28(%rcx)
	mov	%r8,0x30(%rcx)
	mov	%r8,0x38(%rcx)
	add	$0x40,%rcx
	dec	%rax
	jne	3b

4:	/* Set the final (< 64) bytes.  */
	and	$0x3f,%edx	/* %rdx = count % 64.  */
	mov	%rdx,%rax
	shr	$0x3,%rax	/* %rax = remaining 8-byte words.  */
	je	6f

5:	/* First in chunks of 8 bytes.  */
	mov	%r8,(%rcx)
	add	$0x8,%rcx
	dec	%rax
	jne	5b
6:
	and	$0x7,%edx	/* %rdx = count % 8.  */
7:	/* Entry point for small lengths (count <= 7).  */
	test	%rdx,%rdx
	je	9f
8:	/* And finally as bytes (up to 7).  */
	mov	%sil,(%rcx)
	inc	%rcx
	dec	%rdx
	jne	8b
9:
#if BZERO_P
	nop			/* bzero returns void.  */
#else
	/* Load result (only if used as memset).  */
	mov	%rdi,%rax	/* Start address of destination is result.  */
#endif
	retq

	.p2align 4
11:	/* Set 64 bytes per iteration without polluting the cache.  */
	/* We could use movntdq %xmm0,(%rcx) here to further
	   speed up for large cases but let's not use XMM registers.  */
	/* NOTE(review): movnti stores are weakly ordered and no sfence is
	   executed before returning; later glibc variants fence after
	   non-temporal loops — confirm whether callers rely on ordering.  */
	movnti	%r8,(%rcx)
	movnti	%r8,0x8(%rcx)
	movnti	%r8,0x10(%rcx)
	movnti	%r8,0x18(%rcx)
	movnti	%r8,0x20(%rcx)
	movnti	%r8,0x28(%rcx)
	movnti	%r8,0x30(%rcx)
	movnti	%r8,0x38(%rcx)
	add	$0x40,%rcx
	dec	%rax
	jne	11b
	jmp	4b		/* Tail handled by the ordinary path.  */

END (memset)