/* memset/bzero with unaligned store and rep stosb
   Copyright (C) 2016-2019 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

/* memset is implemented as:
   1. Use overlapping stores to avoid branches.
   2. If size is less than VEC_SIZE, use integer register stores.
   3. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores.
   4. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores.
   5. If size is more than 4 * VEC_SIZE, align to 4 * VEC_SIZE with
      4 VEC stores and store 4 * VEC at a time until done.  */
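
/* The overlapping-store trick in step 1 is what removes tail handling:
   for any size n with VEC_SIZE <= n <= 2 * VEC_SIZE, one store at the
   start and one at the end cover the whole buffer even though they may
   overlap.  A minimal C sketch of that idea, using a plain 16-byte block
   as a stand-in for one VEC; the names memset_1x_to_2x_vec and vec_t are
   illustrative only and not part of the build:

   #include <string.h>

   typedef struct { unsigned char b[16]; } vec_t;   // stand-in for one VEC

   static void
   memset_1x_to_2x_vec (unsigned char *dst, int c, size_t n)
   {
     vec_t v;
     memset (&v, c, sizeof v);                       // broadcast the fill byte
     memcpy (dst, &v, sizeof v);                     // store at the start
     memcpy (dst + n - sizeof v, &v, sizeof v);      // store at the end; may overlap
   }
*/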

#include <sysdep.h>

#ifndef MEMSET_CHK_SYMBOL
# define MEMSET_CHK_SYMBOL(p,s) MEMSET_SYMBOL(p, s)
#endif

#ifndef WMEMSET_CHK_SYMBOL
# define WMEMSET_CHK_SYMBOL(p,s) WMEMSET_SYMBOL(p, s)
#endif

#ifndef VZEROUPPER
# if VEC_SIZE > 16
#  define VZEROUPPER vzeroupper
# else
#  define VZEROUPPER
# endif
#endif

#ifndef VZEROUPPER_SHORT_RETURN
# if VEC_SIZE > 16
#  define VZEROUPPER_SHORT_RETURN vzeroupper
# else
#  define VZEROUPPER_SHORT_RETURN rep
# endif
#endif
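
/* Note: with VEC_SIZE == 16 the macro above expands to a "rep" prefix in
   front of the following "ret", i.e. "rep ret", a form historically
   preferred on some AMD processors when a return is a branch target; no
   vzeroupper is needed in that case since only SSE registers are used.  */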

#ifndef MOVQ
# if VEC_SIZE > 16
#  define MOVQ vmovq
# else
#  define MOVQ movq
# endif
#endif

/* Threshold to use Enhanced REP STOSB.  Since there is overhead to set
   up REP STOSB operation, REP STOSB isn't faster on short data.  The
   memset micro benchmark in glibc shows that 2KB is the approximate
   value above which REP STOSB becomes faster on processors with
   Enhanced REP STOSB.  Since the stored value is fixed, larger register
   size has minimal impact on threshold.  */
#ifndef REP_STOSB_THRESHOLD
# define REP_STOSB_THRESHOLD 2048
#endif
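
/* For sizes above REP_STOSB_THRESHOLD the code below hands the whole job
   to "rep stosb", which stores %al into (%rdi) %rcx times while advancing
   %rdi.  A minimal sketch of that path as GCC inline assembly, assuming
   the ABI-guaranteed cleared direction flag; memset_via_stosb is a
   hypothetical helper, illustration only and not part of the build:

   #include <stddef.h>

   static void *
   memset_via_stosb (void *dst, int c, size_t n)
   {
     void *d = dst;
     asm volatile ("rep stosb"
                   : "+D" (d), "+c" (n)   // %rdi = destination, %rcx = count
                   : "a" (c)              // %al = fill byte
                   : "memory");
     return dst;                          // memset returns the original pointer
   }
*/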

#ifndef SECTION
# error SECTION is not defined!
#endif

        .section SECTION(.text),"ax",@progbits
#if VEC_SIZE == 16 && IS_IN (libc)
ENTRY (__bzero)
        movq %rdi, %rax  /* Set return value.  */
        movq %rsi, %rdx  /* Set n.  */
        pxor %xmm0, %xmm0
        jmp L(entry_from_bzero)
END (__bzero)
weak_alias (__bzero, bzero)
#endif

#if IS_IN (libc)
# if defined SHARED
ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
        cmpq %rdx, %rcx
        jb HIDDEN_JUMPTARGET (__chk_fail)
END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
# endif

ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
        shlq $2, %rdx
        WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
        jmp L(entry_from_bzero)
END (WMEMSET_SYMBOL (__wmemset, unaligned))
#endif

#if defined SHARED && IS_IN (libc)
ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
        cmpq %rdx, %rcx
        jb HIDDEN_JUMPTARGET (__chk_fail)
END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
#endif

ENTRY (MEMSET_SYMBOL (__memset, unaligned))
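        /* The including implementation file defines this macro to
           broadcast the fill byte in %esi across %VEC(0) and to set the
           return value (%rax = %rdi).  */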
        MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
L(entry_from_bzero):
        cmpq $VEC_SIZE, %rdx
        jb L(less_vec)
        cmpq $(VEC_SIZE * 2), %rdx
        ja L(more_2x_vec)
        /* From VEC_SIZE to 2 * VEC_SIZE.  No branch when size == VEC_SIZE.  */
        VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
        VMOVU %VEC(0), (%rdi)
        VZEROUPPER
        ret
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMSET_SYMBOL (__memset, unaligned))

# if VEC_SIZE == 16
ENTRY (__memset_chk_erms)
        cmpq %rdx, %rcx
        jb HIDDEN_JUMPTARGET (__chk_fail)
END (__memset_chk_erms)

/* Only used to measure performance of REP STOSB.  */
ENTRY (__memset_erms)
        /* Skip zero length.  */
        testq %rdx, %rdx
        jnz L(stosb)
        movq %rdi, %rax
        ret
# else
/* Provide a hidden symbol to the debugger.  */
        .hidden MEMSET_SYMBOL (__memset, erms)
ENTRY (MEMSET_SYMBOL (__memset, erms))
# endif
L(stosb):
        /* Issue vzeroupper before rep stosb.  */
        VZEROUPPER
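        /* rep stosb stores %al into (%rdi), %rcx times.  Move the count
           into %rcx and the fill byte into %al, and keep the destination
           in %rdx so it can be returned afterwards.  */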
        movq %rdx, %rcx
        movzbl %sil, %eax
        movq %rdi, %rdx
        rep stosb
        movq %rdx, %rax
        ret
# if VEC_SIZE == 16
END (__memset_erms)
# else
END (MEMSET_SYMBOL (__memset, erms))
# endif

# if defined SHARED && IS_IN (libc)
ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
        cmpq %rdx, %rcx
        jb HIDDEN_JUMPTARGET (__chk_fail)
END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
# endif

ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
        MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
        cmpq $VEC_SIZE, %rdx
        jb L(less_vec)
        cmpq $(VEC_SIZE * 2), %rdx
        ja L(stosb_more_2x_vec)
        /* From VEC_SIZE to 2 * VEC_SIZE.  No branch when size == VEC_SIZE.  */
        VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
        VMOVU %VEC(0), (%rdi)
        VZEROUPPER
        ret

L(stosb_more_2x_vec):
        cmpq $REP_STOSB_THRESHOLD, %rdx
        ja L(stosb)
#endif
L(more_2x_vec):
        cmpq $(VEC_SIZE * 4), %rdx
        ja L(loop_start)
        VMOVU %VEC(0), (%rdi)
        VMOVU %VEC(0), VEC_SIZE(%rdi)
        VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
        VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
L(return):
        VZEROUPPER
        ret

L(loop_start):
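        /* More than 4 * VEC_SIZE bytes: store the first and the last
           4 * VEC_SIZE bytes with unaligned stores, then fill the
           4 * VEC_SIZE-aligned region in between (%rcx up to %rdx) with
           aligned stores in L(loop).  */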
        leaq (VEC_SIZE * 4)(%rdi), %rcx
        VMOVU %VEC(0), (%rdi)
        andq $-(VEC_SIZE * 4), %rcx
        VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
        VMOVU %VEC(0), VEC_SIZE(%rdi)
        VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
        VMOVU %VEC(0), (VEC_SIZE * 2)(%rdi)
        VMOVU %VEC(0), -(VEC_SIZE * 3)(%rdi,%rdx)
        VMOVU %VEC(0), (VEC_SIZE * 3)(%rdi)
        VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi,%rdx)
        addq %rdi, %rdx
        andq $-(VEC_SIZE * 4), %rdx
        cmpq %rdx, %rcx
        je L(return)
L(loop):
        VMOVA %VEC(0), (%rcx)
        VMOVA %VEC(0), VEC_SIZE(%rcx)
        VMOVA %VEC(0), (VEC_SIZE * 2)(%rcx)
        VMOVA %VEC(0), (VEC_SIZE * 3)(%rcx)
        addq $(VEC_SIZE * 4), %rcx
        cmpq %rcx, %rdx
        jne L(loop)
        VZEROUPPER_SHORT_RETURN
        ret
L(less_vec):
        /* Less than 1 VEC.  */
# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
#  error Unsupported VEC_SIZE!
# endif
# if VEC_SIZE > 32
        cmpb $32, %dl
        jae L(between_32_63)
# endif
# if VEC_SIZE > 16
        cmpb $16, %dl
        jae L(between_16_31)
# endif
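        /* Copy the low 8 bytes of the fill pattern from %xmm0 into %rcx
           for the integer register stores below.  */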
        MOVQ %xmm0, %rcx
        cmpb $8, %dl
        jae L(between_8_15)
        cmpb $4, %dl
        jae L(between_4_7)
        cmpb $1, %dl
        ja L(between_2_3)
        jb 1f
        movb %cl, (%rdi)
1:
        VZEROUPPER
        ret
# if VEC_SIZE > 32
        /* From 32 to 63.  No branch when size == 32.  */
L(between_32_63):
        vmovdqu %ymm0, -32(%rdi,%rdx)
        vmovdqu %ymm0, (%rdi)
        VZEROUPPER
        ret
# endif
# if VEC_SIZE > 16
        /* From 16 to 31.  No branch when size == 16.  */
L(between_16_31):
        vmovdqu %xmm0, -16(%rdi,%rdx)
        vmovdqu %xmm0, (%rdi)
        VZEROUPPER
        ret
# endif
        /* From 8 to 15.  No branch when size == 8.  */
L(between_8_15):
        movq %rcx, -8(%rdi,%rdx)
        movq %rcx, (%rdi)
        VZEROUPPER
        ret
L(between_4_7):
        /* From 4 to 7.  No branch when size == 4.  */
        movl %ecx, -4(%rdi,%rdx)
        movl %ecx, (%rdi)
        VZEROUPPER
        ret
L(between_2_3):
        /* From 2 to 3.  No branch when size == 2.  */
        movw %cx, -2(%rdi,%rdx)
        movw %cx, (%rdi)
        VZEROUPPER
        ret
END (MEMSET_SYMBOL (__memset, unaligned_erms))