1 /* memset/bzero with unaligned store and rep stosb
2 Copyright (C) 2016-2019 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
19 /* memset is implemented as:
20 1. Use overlapping store to avoid branch.
21 2. If size is less than VEC, use integer register stores.
22 3. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores.
23 4. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores.
24 5. If size is more than 4 * VEC_SIZE, align to 4 * VEC_SIZE with
25 4 VEC stores and store 4 * VEC at a time until done. */
/* Default symbol-naming hooks: unless an including wrapper overrides
   them, the fortified __memset_chk/__wmemset_chk entry points reuse
   the same symbol-decoration scheme as the plain entries.  */
29 #ifndef MEMSET_CHK_SYMBOL
30 # define MEMSET_CHK_SYMBOL(p,s) MEMSET_SYMBOL(p, s)
33 #ifndef WMEMSET_CHK_SYMBOL
34 # define WMEMSET_CHK_SYMBOL(p,s) WMEMSET_SYMBOL(p, s)
/* VZEROUPPER clears the upper YMM/ZMM state before returning to
   SSE-using callers (avoids the AVX->SSE transition penalty).
   NOTE(review): the surrounding #if that selects this definition is
   outside this view — presumably keyed on VEC_SIZE > 16; confirm.  */
39 # define VZEROUPPER vzeroupper
45 #ifndef VZEROUPPER_SHORT_RETURN
47 # define VZEROUPPER_SHORT_RETURN vzeroupper
/* The `rep` variant is presumably emitted immediately before a `ret`
   to form the `rep ret` idiom (branch-predictor workaround on older
   AMD cores) when no vzeroupper is needed — TODO confirm against the
   full file.  */
49 # define VZEROUPPER_SHORT_RETURN rep
61 /* Threshold to use Enhanced REP STOSB. Since there is overhead to set
62 up REP STOSB operation, REP STOSB isn't faster on short data. The
63 memset micro benchmark in glibc shows that 2KB is the approximate
64 value above which REP STOSB becomes faster on processors with
65 Enhanced REP STOSB. Since the stored value is fixed, larger register
66 size has minimal impact on threshold. */
67 #ifndef REP_STOSB_THRESHOLD
68 # define REP_STOSB_THRESHOLD 2048
/* SECTION must be supplied by the including wrapper (it selects e.g.
   .text vs .text.avx2); fail loudly if it is missing.  The #ifndef
   guarding this #error is outside this view.  */
72 # error SECTION is not defined!
75 .section SECTION(.text),"ax",@progbits
/* __bzero(dst=%rdi, n=%rsi): only built into the base (VEC_SIZE==16)
   libc variant.  The ENTRY line is outside this view.  It moves n
   into %rdx so it can share the memset tail at L(entry_from_bzero)
   (VEC(0) is presumably already zero there — TODO confirm).  */
76 #if VEC_SIZE == 16 && IS_IN (libc)
78 movq %rdi, %rax /* Set return value. */
79 movq %rsi, %rdx /* Set n. */
81 jmp L(entry_from_bzero)
83 weak_alias (__bzero, bzero)
/* __wmemset_chk(dst, wc, n, dstlen): fortified entry.  The size
   comparison feeding this jb (dstlen vs. n) is outside this view;
   on overflow, tail-jump to __chk_fail (never returns).  */
88 ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
90 jb HIDDEN_JUMPTARGET (__chk_fail)
91 END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
/* __wmemset(dst=%rdi, wc=%esi, n): broadcast the 4-byte wide char
   into VEC(0) and set %rax = dst, then fall into the shared byte-count
   memset path.  NOTE(review): the scaling of n from wide chars to
   bytes is outside this view — presumably done before the jmp or by
   the macro; confirm.  */
94 ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
96 WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
97 jmp L(entry_from_bzero)
98 END (WMEMSET_SYMBOL (__wmemset, unaligned))
/* __memset_chk(dst, c, n, dstlen): fortified entry, shared libc only.
   The dstlen-vs-n comparison preceding this jb is outside this view;
   on overflow, jump to __chk_fail.  */
101 #if defined SHARED && IS_IN (libc)
102 ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
104 jb HIDDEN_JUMPTARGET (__chk_fail)
105 END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
/* __memset(dst=%rdi, c=%esi, n=%rdx), unaligned-store variant.
   The macro broadcasts byte c into VEC(0) and sets %rax = dst (the
   return value).  */
108 ENTRY (MEMSET_SYMBOL (__memset, unaligned))
109 MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
/* Dispatch on n; the conditional branches taken after this compare
   (> 2*VEC and < VEC paths) are outside this view.  */
113 cmpq $(VEC_SIZE * 2), %rdx
115 /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
/* Two overlapping unaligned stores cover [dst, dst+n) branch-free
   for VEC_SIZE <= n <= 2*VEC_SIZE: one ending at dst+n, one at dst.  */
116 VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
117 VMOVU %VEC(0), (%rdi)
120 #if defined USE_MULTIARCH && IS_IN (libc)
121 END (MEMSET_SYMBOL (__memset, unaligned))
/* __memset_chk_erms: fortified wrapper for the pure REP STOSB
   implementation.  The dstlen-vs-n comparison feeding this jb is
   outside this view.  */
124 ENTRY (__memset_chk_erms)
126 jb HIDDEN_JUMPTARGET (__chk_fail)
127 END (__memset_chk_erms)
129 /* Only used to measure performance of REP STOSB. */
130 ENTRY (__memset_erms)
131 /* Skip zero length. */
/* Body (the zero-length test and the rep stosb sequence) is outside
   this view.  */
137 /* Provide a hidden symbol to debugger. */
138 .hidden MEMSET_SYMBOL (__memset, erms)
/* MEMSET_SYMBOL(__memset, erms): shares the rep-stosb tail with
   __memset_erms above (interior lines not visible here).  */
139 ENTRY (MEMSET_SYMBOL (__memset, erms))
142 /* Issue vzeroupper before rep stosb. */
153 END (MEMSET_SYMBOL (__memset, erms))
/* __memset_chk, unaligned_erms variant: same fortification pattern as
   the unaligned variant — the size comparison before this jb is
   outside this view.  */
156 # if defined SHARED && IS_IN (libc)
157 ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
159 jb HIDDEN_JUMPTARGET (__chk_fail)
160 END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
/* __memset(dst=%rdi, c=%esi, n=%rdx), unaligned + ERMS variant:
   identical small-size fast path to the unaligned variant, but large
   sizes divert to REP STOSB via L(stosb_more_2x_vec).  */
163 ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
164 MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
167 cmpq $(VEC_SIZE * 2), %rdx
168 ja L(stosb_more_2x_vec)
169 /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
/* Overlapping pair of unaligned stores covers the whole buffer
   branch-free for VEC_SIZE <= n <= 2*VEC_SIZE.  The < VEC_SIZE path
   and the return are outside this view.  */
170 VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
171 VMOVU %VEC(0), (%rdi)
175 L(stosb_more_2x_vec):
/* Above the ERMS threshold, REP STOSB wins; the branch taken on this
   compare (to the stosb sequence) is outside this view.  */
176 cmpq $REP_STOSB_THRESHOLD, %rdx
/* 2*VEC < n <= 4*VEC: four overlapping unaligned stores — two from
   the front, two ending at dst+n — cover the buffer branch-free.
   The branch after this compare is outside this view.  */
180 cmpq $(VEC_SIZE * 4), %rdx
182 VMOVU %VEC(0), (%rdi)
183 VMOVU %VEC(0), VEC_SIZE(%rdi)
184 VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
185 VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
/* n > 4*VEC: %rcx = dst + 4*VEC rounded down to a 4*VEC boundary,
   i.e. the first 4*VEC-aligned chunk strictly inside the buffer.
   Eight overlapping unaligned stores (four from the front, four
   ending at dst+n) cover everything the aligned loop might miss at
   the head and tail.  */
191 leaq (VEC_SIZE * 4)(%rdi), %rcx
192 VMOVU %VEC(0), (%rdi)
193 andq $-(VEC_SIZE * 4), %rcx
194 VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
195 VMOVU %VEC(0), VEC_SIZE(%rdi)
196 VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
197 VMOVU %VEC(0), (VEC_SIZE * 2)(%rdi)
198 VMOVU %VEC(0), -(VEC_SIZE * 3)(%rdi,%rdx)
199 VMOVU %VEC(0), (VEC_SIZE * 3)(%rdi)
200 VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi,%rdx)
/* Round %rdx down to a 4*VEC multiple to form the loop bound.
   NOTE(review): the instruction combining %rdx with the base pointer
   (to make it an absolute end address) is outside this view — confirm
   against the full file.  */
202 andq $-(VEC_SIZE * 4), %rdx
/* Main loop: four aligned full-vector stores per iteration, advancing
   %rcx by 4*VEC.  The loop-closing compare/branch is outside this
   view.  */
206 VMOVA %VEC(0), (%rcx)
207 VMOVA %VEC(0), VEC_SIZE(%rcx)
208 VMOVA %VEC(0), (VEC_SIZE * 2)(%rcx)
209 VMOVA %VEC(0), (VEC_SIZE * 3)(%rcx)
210 addq $(VEC_SIZE * 4), %rcx
/* Clear upper vector state (or emit the rep-ret prefix) before the
   return that presumably follows — see VZEROUPPER_SHORT_RETURN.  */
213 VZEROUPPER_SHORT_RETURN
216 /* Less than 1 VEC. */
217 # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
218 # error Unsupported VEC_SIZE!
/* Sub-VEC sizes: each bucket uses the same overlapping-store trick at
   a narrower width (one store ending at dst+n, one at dst), so every
   size in the bucket is handled branch-free.  The size compares that
   select each bucket are outside this view.  NOTE(review): %rcx here
   presumably holds the fill byte replicated across 8 bytes (set by
   the VDUP macro or code outside this view) — confirm.  */
241 /* From 32 to 63. No branch when size == 32. */
243 vmovdqu %ymm0, -32(%rdi,%rdx)
244 vmovdqu %ymm0, (%rdi)
249 /* From 16 to 31. No branch when size == 16. */
251 vmovdqu %xmm0, -16(%rdi,%rdx)
252 vmovdqu %xmm0, (%rdi)
256 /* From 8 to 15. No branch when size == 8. */
258 movq %rcx, -8(%rdi,%rdx)
263 /* From 4 to 7. No branch when size == 4. */
264 movl %ecx, -4(%rdi,%rdx)
269 /* From 2 to 3. No branch when size == 2. */
270 movw %cx, -2(%rdi,%rdx)
274 END (MEMSET_SYMBOL (__memset, unaligned_erms))