/* memset/bzero with unaligned store and rep stosb
   Copyright (C) 2016-2019 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

/* memset is implemented as:
   1. Use overlapping store to avoid branch.
   2. If size is less than VEC_SIZE, use integer register stores.
   3. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores.
   4. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores.
   5. If size is more than 4 * VEC_SIZE, align to 4 * VEC_SIZE with
      4 VEC stores and store 4 * VEC at a time until done.  */
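
/* As a rough illustration (not part of the build), for a call of n
   bytes the dispatch above corresponds to:

     if (n < VEC_SIZE)             integer register stores (1/2/4/8 bytes);
     else if (n <= 2 * VEC_SIZE)   1 VEC store at the start, 1 at the end
                                   (the two may overlap);
     else if (n <= 4 * VEC_SIZE)   2 VEC stores at the start, 2 at the end;
     else                          4 unaligned VEC stores at each end, then
                                   an aligned 4 * VEC_SIZE loop in between;

   where VEC_SIZE is the vector width chosen by the including file
   (e.g. 16 for SSE2, 32 for AVX2, 64 for AVX-512).  */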

#include <sysdep.h>

#ifndef MEMSET_CHK_SYMBOL
# define MEMSET_CHK_SYMBOL(p,s)		MEMSET_SYMBOL(p, s)
#endif

#ifndef WMEMSET_CHK_SYMBOL
# define WMEMSET_CHK_SYMBOL(p,s)	WMEMSET_SYMBOL(p, s)
#endif

#ifndef XMM0
# define XMM0				xmm0
#endif

#ifndef YMM0
# define YMM0				ymm0
#endif

#ifndef VZEROUPPER
# if VEC_SIZE > 16
#  define VZEROUPPER			vzeroupper
# else
#  define VZEROUPPER
# endif
#endif

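/* VZEROUPPER_SHORT_RETURN is placed right before the ret at the end of
   the main loop.  With 256/512-bit vectors it clears the upper register
   state just like VZEROUPPER; with SSE2 it expands to a bare rep prefix
   so the following ret assembles as the two-byte "rep ret" form,
   historically used to avoid a branch-prediction penalty on some AMD
   processors.  */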
#ifndef VZEROUPPER_SHORT_RETURN
# if VEC_SIZE > 16
#  define VZEROUPPER_SHORT_RETURN	vzeroupper
# else
#  define VZEROUPPER_SHORT_RETURN	rep
# endif
#endif

#ifndef MOVQ
# if VEC_SIZE > 16
#  define MOVQ				vmovq
# else
#  define MOVQ				movq
# endif
#endif

/* Threshold to use Enhanced REP STOSB.  Since there is overhead to set
   up a REP STOSB operation, REP STOSB isn't faster on short data.  The
   memset micro benchmark in glibc shows that 2KB is the approximate
   value above which REP STOSB becomes faster on processors with
   Enhanced REP STOSB.  Since the stored value is fixed, a larger
   register size has minimal impact on the threshold.  */
#ifndef REP_STOSB_THRESHOLD
# define REP_STOSB_THRESHOLD		2048
#endif

#ifndef SECTION
# error SECTION is not defined!
#endif

	.section SECTION(.text),"ax",@progbits
#if VEC_SIZE == 16 && IS_IN (libc)
ENTRY (__bzero)
	mov	%RDI_LP, %RAX_LP	/* Set return value. */
	mov	%RSI_LP, %RDX_LP	/* Set n. */
	pxor	%XMM0, %XMM0
	jmp	L(entry_from_bzero)
END (__bzero)
weak_alias (__bzero, bzero)
#endif

#if IS_IN (libc)
# if defined SHARED
ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
# endif

ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
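	/* Convert the count of 4-byte wchar_t elements into a byte
	   count.  */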
	shl	$2, %RDX_LP
	WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
	jmp	L(entry_from_bzero)
END (WMEMSET_SYMBOL (__wmemset, unaligned))
#endif

#if defined SHARED && IS_IN (libc)
ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
#endif

ENTRY (MEMSET_SYMBOL (__memset, unaligned))
	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
# ifdef __ILP32__
	/* Clear the upper 32 bits. */
	mov	%edx, %edx
# endif
L(entry_from_bzero):
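	/* Common entry: %rdi is the destination, %rdx the length in
	   bytes, %rax the return value and %VEC(0) the fill pattern
	   broadcast across the vector.  */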
	cmpq	$VEC_SIZE, %rdx
	jb	L(less_vec)
	cmpq	$(VEC_SIZE * 2), %rdx
	ja	L(more_2x_vec)
	/* From VEC_SIZE to 2 * VEC_SIZE.  No branch when size == VEC_SIZE. */
	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(0), (%rdi)
	VZEROUPPER
	ret
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMSET_SYMBOL (__memset, unaligned))

# if VEC_SIZE == 16
ENTRY (__memset_chk_erms)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__memset_chk_erms)

/* Only used to measure performance of REP STOSB. */
ENTRY (__memset_erms)
	/* Skip zero length. */
	test	%RDX_LP, %RDX_LP
	jnz	L(stosb)
	movq	%rdi, %rax
	ret
# else
/* Provide a hidden symbol to debugger. */
	.hidden	MEMSET_SYMBOL (__memset, erms)
ENTRY (MEMSET_SYMBOL (__memset, erms))
# endif
L(stosb):
	/* Issue vzeroupper before rep stosb. */
	VZEROUPPER
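	/* rep stosb stores %al into %rcx bytes at (%rdi).  Stash the
	   original destination in %rdx so it can be returned in %rax.  */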
	mov	%RDX_LP, %RCX_LP
	movzbl	%sil, %eax
	mov	%RDI_LP, %RDX_LP
	rep stosb
	mov	%RDX_LP, %RAX_LP
	ret
# if VEC_SIZE == 16
END (__memset_erms)
# else
END (MEMSET_SYMBOL (__memset, erms))
# endif

# if defined SHARED && IS_IN (libc)
ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
# endif

ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
# ifdef __ILP32__
	/* Clear the upper 32 bits. */
	mov	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)
	cmp	$(VEC_SIZE * 2), %RDX_LP
	ja	L(stosb_more_2x_vec)
	/* From VEC_SIZE to 2 * VEC_SIZE.  No branch when size == VEC_SIZE. */
	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(0), (%rdi)
	VZEROUPPER
	ret

L(stosb_more_2x_vec):
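	/* Use rep stosb only above REP_STOSB_THRESHOLD bytes; otherwise
	   fall through to the vector store path.  */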
	cmpq	$REP_STOSB_THRESHOLD, %rdx
	ja	L(stosb)
#endif
L(more_2x_vec):
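	/* Here size > 2 * VEC_SIZE.  Up to 4 * VEC_SIZE it is covered by
	   two VEC stores at the start and two at the end, which may
	   overlap.  */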
	cmpq	$(VEC_SIZE * 4), %rdx
	ja	L(loop_start)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(0), VEC_SIZE(%rdi)
	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
L(return):
	VZEROUPPER
	ret

L(loop_start):
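	/* size > 4 * VEC_SIZE.  Store 4 VECs unaligned at the start and
	   4 at the end, then set %rcx to the first 4 * VEC_SIZE-aligned
	   address strictly above %rdi and %rdx to the end rounded down
	   to a 4 * VEC_SIZE boundary; the loop below fills [%rcx, %rdx)
	   with aligned stores.  */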
	leaq	(VEC_SIZE * 4)(%rdi), %rcx
	VMOVU	%VEC(0), (%rdi)
	andq	$-(VEC_SIZE * 4), %rcx
	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(0), VEC_SIZE(%rdi)
	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(0), -(VEC_SIZE * 3)(%rdi,%rdx)
	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rdi)
	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi,%rdx)
	addq	%rdi, %rdx
	andq	$-(VEC_SIZE * 4), %rdx
	cmpq	%rdx, %rcx
	je	L(return)
L(loop):
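	/* Fill 4 * VEC_SIZE bytes per iteration with aligned stores.  */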
	VMOVA	%VEC(0), (%rcx)
	VMOVA	%VEC(0), VEC_SIZE(%rcx)
	VMOVA	%VEC(0), (VEC_SIZE * 2)(%rcx)
	VMOVA	%VEC(0), (VEC_SIZE * 3)(%rcx)
	addq	$(VEC_SIZE * 4), %rcx
	cmpq	%rcx, %rdx
	jne	L(loop)
	VZEROUPPER_SHORT_RETURN
	ret
L(less_vec):
	/* Less than 1 VEC. */
# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
#  error Unsupported VEC_SIZE!
# endif
# if VEC_SIZE > 32
	cmpb	$32, %dl
	jae	L(between_32_63)
# endif
# if VEC_SIZE > 16
	cmpb	$16, %dl
	jae	L(between_16_31)
# endif
	MOVQ	%XMM0, %rcx
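	/* Copy the low 8 bytes of the fill pattern into %rcx for the
	   integer stores below.  */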
	cmpb	$8, %dl
	jae	L(between_8_15)
	cmpb	$4, %dl
	jae	L(between_4_7)
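	/* Size is 0 to 3 here: above 1 means 2 or 3 bytes, below 1 means
	   nothing to store, and exactly 1 falls through to the single
	   byte store.  */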
	cmpb	$1, %dl
	ja	L(between_2_3)
	jb	1f
	movb	%cl, (%rdi)
1:
	VZEROUPPER
	ret
# if VEC_SIZE > 32
	/* From 32 to 63. No branch when size == 32. */
L(between_32_63):
	VMOVU	%YMM0, -32(%rdi,%rdx)
	VMOVU	%YMM0, (%rdi)
	VZEROUPPER
	ret
# endif
# if VEC_SIZE > 16
	/* From 16 to 31. No branch when size == 16. */
L(between_16_31):
	VMOVU	%XMM0, -16(%rdi,%rdx)
	VMOVU	%XMM0, (%rdi)
	VZEROUPPER
	ret
# endif
	/* From 8 to 15. No branch when size == 8. */
L(between_8_15):
	movq	%rcx, -8(%rdi,%rdx)
	movq	%rcx, (%rdi)
	VZEROUPPER
	ret
L(between_4_7):
	/* From 4 to 7. No branch when size == 4. */
	movl	%ecx, -4(%rdi,%rdx)
	movl	%ecx, (%rdi)
	VZEROUPPER
	ret
L(between_2_3):
	/* From 2 to 3. No branch when size == 2. */
	movw	%cx, -2(%rdi,%rdx)
	movw	%cx, (%rdi)
	VZEROUPPER
	ret
END (MEMSET_SYMBOL (__memset, unaligned_erms))