/* memset/bzero with unaligned store and rep stosb
   Copyright (C) 2016-2019 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/* memset is implemented as:
   1. Use overlapping stores to avoid branches.
   2. If size is less than VEC_SIZE, use integer register stores.
   3. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores.
   4. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores.
   5. If size is more than 4 * VEC_SIZE, align to 4 * VEC_SIZE with
      4 VEC stores and store 4 * VEC at a time until done.  */
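
/* The dispatch above, sketched in C for illustration only; this block is
   not part of the build.  It shows the VEC_SIZE == 16 (SSE2) case with
   intrinsics.  The function name and the byte loop standing in for the
   small-size and aligned-loop paths are assumptions of this sketch, not
   the code emitted below.

     #include <emmintrin.h>
     #include <stddef.h>

     static void *
     memset_sketch (void *dst, int c, size_t n)
     {
       char *p = dst;
       __m128i v = _mm_set1_epi8 ((char) c);   // broadcast the fill byte

       if (n < 16)
         {
           // Step 2: below one VEC, scalar stores (byte loop for brevity).
           for (size_t i = 0; i < n; i++)
             p[i] = (char) c;
           return dst;
         }
       if (n <= 2 * 16)
         {
           // Steps 1 and 3: two possibly overlapping unaligned VEC stores
           // cover [0, n) with no branch on the exact size.
           _mm_storeu_si128 ((__m128i *) (p + n - 16), v);
           _mm_storeu_si128 ((__m128i *) p, v);
           return dst;
         }
       // Steps 4 and 5: four stores anchored at both ends, then a simple
       // loop for the middle (the assembly below instead aligns this loop
       // to a 4 * VEC_SIZE boundary).
       _mm_storeu_si128 ((__m128i *) p, v);
       _mm_storeu_si128 ((__m128i *) (p + 16), v);
       _mm_storeu_si128 ((__m128i *) (p + n - 16), v);
       _mm_storeu_si128 ((__m128i *) (p + n - 32), v);
       for (char *q = p + 32; q < p + n - 32; q += 16)
         _mm_storeu_si128 ((__m128i *) q, v);
       return dst;
     }  */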

#include <sysdep.h>

#ifndef MEMSET_CHK_SYMBOL
# define MEMSET_CHK_SYMBOL(p,s)		MEMSET_SYMBOL(p, s)
#endif

#ifndef WMEMSET_CHK_SYMBOL
# define WMEMSET_CHK_SYMBOL(p,s)	WMEMSET_SYMBOL(p, s)
#endif

#ifndef VZEROUPPER
# if VEC_SIZE > 16
#  define VZEROUPPER			vzeroupper
# else
#  define VZEROUPPER
# endif
#endif

#ifndef VZEROUPPER_SHORT_RETURN
# if VEC_SIZE > 16
#  define VZEROUPPER_SHORT_RETURN	vzeroupper
# else
#  define VZEROUPPER_SHORT_RETURN	rep
# endif
#endif

#ifndef MOVQ
# if VEC_SIZE > 16
#  define MOVQ				vmovq
# else
#  define MOVQ				movq
# endif
#endif

/* Threshold to use Enhanced REP STOSB.  Since there is overhead to set
   up a REP STOSB operation, REP STOSB isn't faster on short data.  The
   memset micro benchmark in glibc shows that 2KB is the approximate
   value above which REP STOSB becomes faster on processors with
   Enhanced REP STOSB.  Since the stored value is fixed, a larger register
   size has minimal impact on the threshold.  */
#ifndef REP_STOSB_THRESHOLD
# define REP_STOSB_THRESHOLD		2048
#endif
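
/* For illustration only, not part of the build: how the threshold is
   consulted on the ERMS path (compare L(stosb_more_2x_vec) below).  The
   helper name and the use of GCC extended asm are assumptions of this
   sketch, not glibc interfaces.

     #include <stddef.h>

     static void *
     memset_erms_sketch (void *dst, int c, size_t n)
     {
       if (n > 2048)                 // REP_STOSB_THRESHOLD
         {
           // REP STOSB stores AL to [RDI] RCX times, advancing RDI.
           void *d = dst;
           __asm__ __volatile__ ("rep stosb"
                                 : "+D" (d), "+c" (n)
                                 : "a" (c)
                                 : "memory");
           return dst;
         }
       // Below the threshold the vector-store paths win; a byte loop
       // stands in for them here.
       for (size_t i = 0; i < n; i++)
         ((char *) dst)[i] = (char) c;
       return dst;
     }  */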

#ifndef SECTION
# error SECTION is not defined!
#endif

	.section SECTION(.text),"ax",@progbits
#if VEC_SIZE == 16 && IS_IN (libc)
ENTRY (__bzero)
	mov	%RDI_LP, %RAX_LP /* Set return value.  */
	mov	%RSI_LP, %RDX_LP /* Set n.  */
	pxor	%xmm0, %xmm0
	jmp	L(entry_from_bzero)
END (__bzero)
weak_alias (__bzero, bzero)
#endif

#if IS_IN (libc)
# if defined SHARED
ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
# endif

ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
	shl	$2, %RDX_LP	/* Convert the wchar_t count to a byte count.  */
	WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
	jmp	L(entry_from_bzero)
END (WMEMSET_SYMBOL (__wmemset, unaligned))
#endif

#if defined SHARED && IS_IN (libc)
ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
#endif

ENTRY (MEMSET_SYMBOL (__memset, unaligned))
	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	mov	%edx, %edx
# endif
L(entry_from_bzero):
	cmpq	$VEC_SIZE, %rdx
	jb	L(less_vec)
	cmpq	$(VEC_SIZE * 2), %rdx
	ja	L(more_2x_vec)
	/* From VEC_SIZE to 2 * VEC_SIZE.  No branch when size == VEC_SIZE.  */
	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(0), (%rdi)
	VZEROUPPER
	ret
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMSET_SYMBOL (__memset, unaligned))

# if VEC_SIZE == 16
ENTRY (__memset_chk_erms)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__memset_chk_erms)

/* Only used to measure performance of REP STOSB.  */
ENTRY (__memset_erms)
	/* Skip zero length.  */
	test	%RDX_LP, %RDX_LP
	jnz	L(stosb)
	movq	%rdi, %rax
	ret
# else
/* Provide a hidden symbol to the debugger.  */
	.hidden	MEMSET_SYMBOL (__memset, erms)
ENTRY (MEMSET_SYMBOL (__memset, erms))
# endif
L(stosb):
	/* Issue vzeroupper before rep stosb.  */
	VZEROUPPER
	mov	%RDX_LP, %RCX_LP	/* Count in RCX.  */
	movzbl	%sil, %eax		/* Fill byte in AL.  */
	mov	%RDI_LP, %RDX_LP	/* Save destination; stosb advances RDI.  */
	rep stosb
	mov	%RDX_LP, %RAX_LP	/* Return the original destination.  */
	ret
# if VEC_SIZE == 16
END (__memset_erms)
# else
END (MEMSET_SYMBOL (__memset, erms))
# endif

# if defined SHARED && IS_IN (libc)
ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
# endif

ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	mov	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)
	cmp	$(VEC_SIZE * 2), %RDX_LP
	ja	L(stosb_more_2x_vec)
	/* From VEC_SIZE to 2 * VEC_SIZE.  No branch when size == VEC_SIZE.  */
	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(0), (%rdi)
	VZEROUPPER
	ret

L(stosb_more_2x_vec):
	cmpq	$REP_STOSB_THRESHOLD, %rdx
	ja	L(stosb)
#endif
L(more_2x_vec):
	cmpq	$(VEC_SIZE * 4), %rdx
	ja	L(loop_start)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(0), VEC_SIZE(%rdi)
	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
L(return):
	VZEROUPPER
	ret

L(loop_start):
	/* RCX = first 4 * VEC_SIZE-aligned address above RDI; the four
	   unaligned VEC stores below cover everything before it.  */
	leaq	(VEC_SIZE * 4)(%rdi), %rcx
	VMOVU	%VEC(0), (%rdi)
	andq	$-(VEC_SIZE * 4), %rcx
	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(0), VEC_SIZE(%rdi)
	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(0), -(VEC_SIZE * 3)(%rdi,%rdx)
	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rdi)
	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi,%rdx)
	/* RDX = end of the fill rounded down to a 4 * VEC_SIZE boundary;
	   the four trailing unaligned stores above cover the tail.  */
	addq	%rdi, %rdx
	andq	$-(VEC_SIZE * 4), %rdx
	cmpq	%rdx, %rcx
	je	L(return)
L(loop):
	VMOVA	%VEC(0), (%rcx)
	VMOVA	%VEC(0), VEC_SIZE(%rcx)
	VMOVA	%VEC(0), (VEC_SIZE * 2)(%rcx)
	VMOVA	%VEC(0), (VEC_SIZE * 3)(%rcx)
	addq	$(VEC_SIZE * 4), %rcx
	cmpq	%rcx, %rdx
	jne	L(loop)
	VZEROUPPER_SHORT_RETURN
	ret
L(less_vec):
	/* Less than 1 VEC.  */
# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
#  error Unsupported VEC_SIZE!
# endif
# if VEC_SIZE > 32
	cmpb	$32, %dl
	jae	L(between_32_63)
# endif
# if VEC_SIZE > 16
	cmpb	$16, %dl
	jae	L(between_16_31)
# endif
	MOVQ	%xmm0, %rcx	/* Low 8 bytes of the fill pattern.  */
	cmpb	$8, %dl
	jae	L(between_8_15)
	cmpb	$4, %dl
	jae	L(between_4_7)
	cmpb	$1, %dl
	ja	L(between_2_3)
	jb	1f		/* Size 0: nothing to store.  */
	movb	%cl, (%rdi)	/* Size 1: store a single byte.  */
1:
	VZEROUPPER
	ret
# if VEC_SIZE > 32
	/* From 32 to 63.  No branch when size == 32.  */
L(between_32_63):
	vmovdqu	%ymm0, -32(%rdi,%rdx)
	vmovdqu	%ymm0, (%rdi)
	VZEROUPPER
	ret
# endif
# if VEC_SIZE > 16
	/* From 16 to 31.  No branch when size == 16.  */
L(between_16_31):
	vmovdqu	%xmm0, -16(%rdi,%rdx)
	vmovdqu	%xmm0, (%rdi)
	VZEROUPPER
	ret
# endif
	/* From 8 to 15.  No branch when size == 8.  */
L(between_8_15):
	movq	%rcx, -8(%rdi,%rdx)
	movq	%rcx, (%rdi)
	VZEROUPPER
	ret
L(between_4_7):
	/* From 4 to 7.  No branch when size == 4.  */
	movl	%ecx, -4(%rdi,%rdx)
	movl	%ecx, (%rdi)
	VZEROUPPER
	ret
L(between_2_3):
	/* From 2 to 3.  No branch when size == 2.  */
	movw	%cx, -2(%rdi,%rdx)
	movw	%cx, (%rdi)
	VZEROUPPER
	ret
END (MEMSET_SYMBOL (__memset, unaligned_erms))