]>
Commit | Line | Data |
---|---|---|
78df0fcb AJ |
1 | /* memset/bzero -- set memory area to CH/0 |
2 | Optimized version for x86-64. | |
3 | Copyright (C) 2002 Free Software Foundation, Inc. | |
4 | This file is part of the GNU C Library. | |
5 | Contributed by Andreas Jaeger <aj@suse.de>. | |
6 | ||
7 | The GNU C Library is free software; you can redistribute it and/or | |
8 | modify it under the terms of the GNU Lesser General Public | |
9 | License as published by the Free Software Foundation; either | |
10 | version 2.1 of the License, or (at your option) any later version. | |
11 | ||
12 | The GNU C Library is distributed in the hope that it will be useful, | |
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
15 | Lesser General Public License for more details. | |
16 | ||
17 | You should have received a copy of the GNU Lesser General Public | |
18 | License along with the GNU C Library; if not, write to the Free | |
19 | Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA | |
20 | 02111-1307 USA. */ | |
21 | ||
22 | #include <sysdep.h> | |
23 | #include "asm-syntax.h" | |
24 | #include "bp-sym.h" | |
25 | #include "bp-asm.h" | |
26 | ||
/* BEWARE: `#ifdef memset' means that memset is redefined as `bzero'.
   BZERO_P is 1 in that case and 0 otherwise.  It is set with #ifdef
   rather than as `(defined memset)' because a `defined' operator that
   is produced by macro expansion inside #if has undefined behavior in
   ISO C (GCC warns about it with -Wexpansion-to-defined).  */
#ifdef memset
# define BZERO_P 1
#else
# define BZERO_P 0
#endif
29 | ||
/* Byte count at or above which whole 64-byte blocks are written with
   non-temporal stores (movnti) instead of ordinary stores.  This is
   somewhat experimental and could be made dependent on the cache
   size.  The `$' is part of the macro because LARGE is used directly
   as an immediate operand.  */
#define LARGE $120000

/* memset: fill %rdx bytes starting at %rdi with the byte in %sil and
   return the start address in %rax.
   When BZERO_P is set the same code is the bzero variant: the length
   arrives in %rsi, the fill byte is 0, and no result is loaded.

   Register roles inside the function:
     %rdi  start of the destination, never modified
     %rcx  running store pointer
     %rdx  bytes still to store
     %r8   fill byte replicated into all eight bytes
     %rax  scratch: block / quadword counters, then the result
   Clobbered: %rax, %rcx, %rdx, %r8, flags (and %rsi for bzero).  */

	.text
ENTRY (memset)
#if BZERO_P
	mov	%rsi,%rdx	/* Adjust parameter: length is the 2nd argument.  */
	xorq	%rsi,%rsi	/* Fill with 0s.  */
#endif
	cmp	$0x7,%rdx	/* Check for small length.  */
	mov	%rdi,%rcx	/* %rcx = running pointer; %rdi keeps the start.
				   (mov leaves the flags of the cmp intact.)  */
	jbe	7f		/* At most 7 bytes: byte loop only.  */

#if BZERO_P
	mov	%rsi,%r8	/* Just copy 0.  */
#else
	/* Populate 8 bit data to full 64-bit.  */
	movabs	$0x0101010101010101,%r8
	movzbl	%sil,%eax
	imul	%rax,%r8
#endif
	test	$0x7,%edi	/* Check for alignment.  */
	je	2f

	.p2align 4
1:	/* Align ptr to 8 byte.  The length is at least 8 here and at
	   most 7 bytes are stored, so %rdx cannot wrap.  */
	mov	%sil,(%rcx)
	dec	%rdx
	inc	%rcx
	test	$0x7,%ecx
	jne	1b

2:	/* Check for really large regions.  */
	mov	%rdx,%rax
	shr	$0x6,%rax	/* %rax = number of whole 64-byte blocks.  */
	je	4f		/* None: go straight to the tail.  */
	cmp	LARGE, %rdx
	jae	11f

	.p2align 4
3:	/* Store 64 bytes per iteration.  */
	mov	%r8,(%rcx)
	mov	%r8,0x8(%rcx)
	mov	%r8,0x10(%rcx)
	mov	%r8,0x18(%rcx)
	mov	%r8,0x20(%rcx)
	mov	%r8,0x28(%rcx)
	mov	%r8,0x30(%rcx)
	mov	%r8,0x38(%rcx)
	add	$0x40,%rcx
	dec	%rax
	jne	3b

4:	/* Store final bytes (fewer than 64 remain).  */
	and	$0x3f,%edx
	mov	%rdx,%rax
	shr	$0x3,%rax	/* %rax = number of whole quadwords left.  */
	je	6f

5:	/* First in chunks of 8 bytes.  */
	mov	%r8,(%rcx)
	add	$0x8,%rcx
	dec	%rax
	jne	5b
6:
	and	$0x7,%edx	/* %rdx = trailing byte count.  */
7:
	test	%rdx,%rdx
	je	9f
8:	/* And finally as bytes (up to 7).  */
	mov	%sil,(%rcx)
	inc	%rcx
	dec	%rdx
	jne	8b
9:
#if BZERO_P
	nop
#else
	/* Load result (only if used as memset).  */
	mov	%rdi,%rax	/* start address of destination is result */
#endif
	retq

	.p2align 4
11:	/* Store 64 bytes per iteration without polluting the cache.  */
	/* We could use	movntdq  %xmm0,(%rcx) here to further
	   speed up for large cases but let's not use XMM registers.  */
	movnti	%r8,(%rcx)
	movnti	%r8,0x8(%rcx)
	movnti	%r8,0x10(%rcx)
	movnti	%r8,0x18(%rcx)
	movnti	%r8,0x20(%rcx)
	movnti	%r8,0x28(%rcx)
	movnti	%r8,0x30(%rcx)
	movnti	%r8,0x38(%rcx)
	add	$0x40,%rcx
	dec	%rax
	jne	11b
	/* Non-temporal stores are weakly ordered.  Fence them so that
	   they become globally visible before any store issued after
	   this point (the tail below, or the caller's later stores).  */
	sfence
	jmp	4b

END (memset)