/* memset/bzero with unaligned store and rep stosb
   Copyright (C) 2016-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/* memset is implemented as:
   1. Use overlapping stores to avoid branches.
   2. If size is less than VEC_SIZE, use integer register stores.
   3. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores.
   4. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores.
   5. If size is more than 4 * VEC_SIZE, align to 4 * VEC_SIZE with
      4 VEC stores and store 4 * VEC at a time until done.
   An illustrative C sketch of this dispatch follows.  */
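/* The same dispatch as a rough standalone C sketch.  Illustrative only,
   not part of the glibc build: `VEC', `store_vec' and `memset_sketch' are
   made-up names standing in for the VEC_SIZE macro and for single
   unaligned/aligned vector stores.

     #include <stddef.h>
     #include <stdint.h>
     #include <string.h>

     #define VEC 32                       // stand-in for VEC_SIZE

     // One VEC-byte store; stands in for a single VMOVU/VMOVA.
     static void store_vec (unsigned char *p, unsigned char b)
     {
       memset (p, b, VEC);
     }

     void *memset_sketch (void *dstp, int c, size_t n)
     {
       unsigned char *dst = dstp;
       unsigned char b = (unsigned char) c;

       if (n < VEC)                        // case 2: the real code uses
         {                                 // overlapping 1/2/4/8-byte stores;
           for (size_t i = 0; i < n; i++)  // a byte loop keeps the sketch short
             dst[i] = b;
           return dstp;
         }
       if (n <= 2 * (size_t) VEC)          // case 3: 2 overlapping VEC stores
         {
           store_vec (dst, b);
           store_vec (dst + n - VEC, b);
           return dstp;
         }
       if (n <= 4 * (size_t) VEC)          // case 4: 4 overlapping VEC stores
         {
           store_vec (dst, b);
           store_vec (dst + VEC, b);
           store_vec (dst + n - VEC, b);
           store_vec (dst + n - 2 * VEC, b);
           return dstp;
         }
       // Case 5: unaligned stores cover the first and last 4 * VEC bytes;
       // an aligned loop fills whatever lies in between.
       for (int i = 0; i < 4; i++)
         {
           store_vec (dst + (size_t) i * VEC, b);
           store_vec (dst + n - (size_t) (i + 1) * VEC, b);
         }
       uintptr_t beg = ((uintptr_t) dst + 4 * VEC) & -(uintptr_t) (4 * VEC);
       uintptr_t end = ((uintptr_t) dst + n) & -(uintptr_t) (4 * VEC);
       for (unsigned char *p = (unsigned char *) beg;
            (uintptr_t) p != end; p += 4 * VEC)
         for (int i = 0; i < 4; i++)
           store_vec (p + (size_t) i * VEC, b);
       return dstp;
     }  */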

#include <sysdep.h>

#ifndef MEMSET_CHK_SYMBOL
# define MEMSET_CHK_SYMBOL(p,s) MEMSET_SYMBOL(p, s)
#endif

#ifndef WMEMSET_CHK_SYMBOL
# define WMEMSET_CHK_SYMBOL(p,s) WMEMSET_SYMBOL(p, s)
#endif

#ifndef XMM0
# define XMM0 xmm0
#endif

#ifndef YMM0
# define YMM0 ymm0
#endif

#ifndef VZEROUPPER
# if VEC_SIZE > 16
#  define VZEROUPPER vzeroupper
# else
#  define VZEROUPPER
# endif
#endif

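/* For the SSE2 (VEC_SIZE == 16) variant, VZEROUPPER_SHORT_RETURN expands
   to a bare `rep' prefix so that the following `ret' assembles as the
   two-byte `rep ret' idiom, traditionally used to avoid branch-prediction
   penalties on some older AMD processors; the AVX variants issue a real
   vzeroupper instead.  */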
#ifndef VZEROUPPER_SHORT_RETURN
# if VEC_SIZE > 16
#  define VZEROUPPER_SHORT_RETURN vzeroupper
# else
#  define VZEROUPPER_SHORT_RETURN rep
# endif
#endif

#ifndef MOVQ
# if VEC_SIZE > 16
#  define MOVQ vmovq
# else
#  define MOVQ movq
# endif
#endif

#ifndef SECTION
# error SECTION is not defined!
#endif

        .section SECTION(.text),"ax",@progbits
#if VEC_SIZE == 16 && IS_IN (libc)
ENTRY (__bzero)
        mov     %RDI_LP, %RAX_LP        /* Set return value.  */
        mov     %RSI_LP, %RDX_LP        /* Set n.  */
        pxor    %XMM0, %XMM0
        jmp     L(entry_from_bzero)
END (__bzero)
weak_alias (__bzero, bzero)
#endif

#if IS_IN (libc)
# if defined SHARED
ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
        cmp     %RDX_LP, %RCX_LP
        jb      HIDDEN_JUMPTARGET (__chk_fail)
END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
# endif

ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
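        /* The wmemset count in RDX is in 4-byte wchar_t units; convert it
           to a byte count before jumping into the shared memset path.  */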
        shl     $2, %RDX_LP
        WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
        jmp     L(entry_from_bzero)
END (WMEMSET_SYMBOL (__wmemset, unaligned))
#endif

#if defined SHARED && IS_IN (libc)
ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
        cmp     %RDX_LP, %RCX_LP
        jb      HIDDEN_JUMPTARGET (__chk_fail)
END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
#endif

ENTRY (MEMSET_SYMBOL (__memset, unaligned))
        MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
# ifdef __ILP32__
        /* Clear the upper 32 bits.  */
        mov     %edx, %edx
# endif
L(entry_from_bzero):
        cmpq    $VEC_SIZE, %rdx
        jb      L(less_vec)
        cmpq    $(VEC_SIZE * 2), %rdx
        ja      L(more_2x_vec)
        /* From VEC_SIZE to 2 * VEC_SIZE.  No branch when size == VEC_SIZE.  */
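        /* The second store is anchored at the end of the buffer, so the two
           unaligned stores overlap as needed and cover every size in this
           range without further branching.  */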
        VMOVU   %VEC(0), -VEC_SIZE(%rdi,%rdx)
        VMOVU   %VEC(0), (%rdi)
        VZEROUPPER
        ret
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMSET_SYMBOL (__memset, unaligned))

# if VEC_SIZE == 16
ENTRY (__memset_chk_erms)
        cmp     %RDX_LP, %RCX_LP
        jb      HIDDEN_JUMPTARGET (__chk_fail)
END (__memset_chk_erms)

/* Only used to measure performance of REP STOSB.  */
ENTRY (__memset_erms)
        /* Skip zero length.  */
        test    %RDX_LP, %RDX_LP
        jnz     L(stosb)
        movq    %rdi, %rax
        ret
# else
/* Provide a hidden symbol to the debugger.  */
        .hidden MEMSET_SYMBOL (__memset, erms)
ENTRY (MEMSET_SYMBOL (__memset, erms))
# endif
L(stosb):
        /* Issue vzeroupper before rep stosb.  */
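        /* rep stosb takes the count in %rcx and the fill byte in %al, and
           advances %rdi as it stores; save the original destination in %rdx
           so it can be returned in %rax afterwards.  */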
        VZEROUPPER
        mov     %RDX_LP, %RCX_LP
        movzbl  %sil, %eax
        mov     %RDI_LP, %RDX_LP
        rep stosb
        mov     %RDX_LP, %RAX_LP
        ret
# if VEC_SIZE == 16
END (__memset_erms)
# else
END (MEMSET_SYMBOL (__memset, erms))
# endif

# if defined SHARED && IS_IN (libc)
ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
        cmp     %RDX_LP, %RCX_LP
        jb      HIDDEN_JUMPTARGET (__chk_fail)
END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
# endif

ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
        MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
# ifdef __ILP32__
        /* Clear the upper 32 bits.  */
        mov     %edx, %edx
# endif
        cmp     $VEC_SIZE, %RDX_LP
        jb      L(less_vec)
        cmp     $(VEC_SIZE * 2), %RDX_LP
        ja      L(stosb_more_2x_vec)
        /* From VEC_SIZE to 2 * VEC_SIZE.  No branch when size == VEC_SIZE.  */
        VMOVU   %VEC(0), -VEC_SIZE(%rdi,%rdx)
        VMOVU   %VEC(0), (%rdi)
        VZEROUPPER
        ret

L(stosb_more_2x_vec):
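        /* Sizes above the runtime-selected __x86_rep_stosb_threshold use
           rep stosb; smaller sizes fall through to the vector paths.  */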
        cmp     __x86_rep_stosb_threshold(%rip), %RDX_LP
        ja      L(stosb)
#endif
L(more_2x_vec):
        cmpq    $(VEC_SIZE * 4), %rdx
        ja      L(loop_start)
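        /* From 2 * VEC_SIZE to 4 * VEC_SIZE: two unaligned stores from the
           start and two from the end overlap to cover the whole range.  */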
        VMOVU   %VEC(0), (%rdi)
        VMOVU   %VEC(0), VEC_SIZE(%rdi)
        VMOVU   %VEC(0), -VEC_SIZE(%rdi,%rdx)
        VMOVU   %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
L(return):
        VZEROUPPER
        ret

L(loop_start):
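        /* More than 4 * VEC_SIZE bytes: cover the first and last
           4 * VEC_SIZE bytes with unaligned stores, then fill the middle
           with aligned 4 * VEC_SIZE stores.  %rcx becomes the first
           4 * VEC_SIZE-aligned address above %rdi and %rdx the buffer end
           rounded down to the same alignment; if they meet, the unaligned
           stores have already covered everything.  */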
        leaq    (VEC_SIZE * 4)(%rdi), %rcx
        VMOVU   %VEC(0), (%rdi)
        andq    $-(VEC_SIZE * 4), %rcx
        VMOVU   %VEC(0), -VEC_SIZE(%rdi,%rdx)
        VMOVU   %VEC(0), VEC_SIZE(%rdi)
        VMOVU   %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
        VMOVU   %VEC(0), (VEC_SIZE * 2)(%rdi)
        VMOVU   %VEC(0), -(VEC_SIZE * 3)(%rdi,%rdx)
        VMOVU   %VEC(0), (VEC_SIZE * 3)(%rdi)
        VMOVU   %VEC(0), -(VEC_SIZE * 4)(%rdi,%rdx)
        addq    %rdi, %rdx
        andq    $-(VEC_SIZE * 4), %rdx
        cmpq    %rdx, %rcx
        je      L(return)
L(loop):
        VMOVA   %VEC(0), (%rcx)
        VMOVA   %VEC(0), VEC_SIZE(%rcx)
        VMOVA   %VEC(0), (VEC_SIZE * 2)(%rcx)
        VMOVA   %VEC(0), (VEC_SIZE * 3)(%rcx)
        addq    $(VEC_SIZE * 4), %rcx
        cmpq    %rcx, %rdx
        jne     L(loop)
        VZEROUPPER_SHORT_RETURN
        ret
L(less_vec):
        /* Less than 1 VEC.  */
# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
#  error Unsupported VEC_SIZE!
# endif
# if VEC_SIZE > 32
        cmpb    $32, %dl
        jae     L(between_32_63)
# endif
# if VEC_SIZE > 16
        cmpb    $16, %dl
        jae     L(between_16_31)
# endif
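        /* Copy the low 8 bytes of the fill pattern from %XMM0 to %rcx for
           the integer-store paths below.  */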
        MOVQ    %XMM0, %rcx
        cmpb    $8, %dl
        jae     L(between_8_15)
        cmpb    $4, %dl
        jae     L(between_4_7)
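        /* 0 to 3 bytes remain: sizes 2 and 3 branch to L(between_2_3);
           size 1 stores a single byte; size 0 stores nothing.  */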
        cmpb    $1, %dl
        ja      L(between_2_3)
        jb      1f
        movb    %cl, (%rdi)
1:
        VZEROUPPER
        ret
# if VEC_SIZE > 32
/* From 32 to 63.  No branch when size == 32.  */
L(between_32_63):
        VMOVU   %YMM0, -32(%rdi,%rdx)
        VMOVU   %YMM0, (%rdi)
        VZEROUPPER
        ret
# endif
# if VEC_SIZE > 16
/* From 16 to 31.  No branch when size == 16.  */
L(between_16_31):
        VMOVU   %XMM0, -16(%rdi,%rdx)
        VMOVU   %XMM0, (%rdi)
        VZEROUPPER
        ret
# endif
/* From 8 to 15.  No branch when size == 8.  */
L(between_8_15):
        movq    %rcx, -8(%rdi,%rdx)
        movq    %rcx, (%rdi)
        VZEROUPPER
        ret
L(between_4_7):
        /* From 4 to 7.  No branch when size == 4.  */
        movl    %ecx, -4(%rdi,%rdx)
        movl    %ecx, (%rdi)
        VZEROUPPER
        ret
L(between_2_3):
        /* From 2 to 3.  No branch when size == 2.  */
        movw    %cx, -2(%rdi,%rdx)
        movw    %cx, (%rdi)
        VZEROUPPER
        ret
END (MEMSET_SYMBOL (__memset, unaligned_erms))