/* sysdeps/aarch64/memset.S */
/* Copyright (C) 2012-2016 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Unaligned accesses
 *
 */

#include <sysdep.h>

/* By default we assume that the DC instruction can be used to zero
   data blocks more efficiently.  In some circumstances this might be
   unsafe, for example in an asymmetric multiprocessor environment with
   different DC clear lengths (neither the upper nor lower lengths are
   safe to use).  The feature can be disabled by defining DONT_USE_DC.

   If code may be run in a virtualized environment, then define
   MAYBE_VIRT.  This will cause the code to cache the system register
   values rather than re-reading them each call.  */

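/* Illustration only, not assembled: the probe performed below can be
   sketched in C roughly as follows, assuming DCZID_EL0 is readable from
   EL0 (as it is under Linux).  Bit 4 (DZP) set means DC ZVA is
   prohibited; bits 3:0 (BS) hold log2 of the block size in 4-byte
   words, so the block length in bytes is 4 << BS.

       unsigned long dczid;
       unsigned int zva_len;
       __asm__ ("mrs %0, dczid_el0" : "=r" (dczid));
       if (dczid & (1UL << 4))
         zva_len = 0;                   // DC ZVA prohibited
       else
         zva_len = 4u << (dczid & 15);  // ZVA block length in bytes
 */
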
#define dstin       x0
#define val         w1
#define count       x2
#define tmp1        x3
#define tmp1w       w3
#define tmp2        x4
#define tmp2w       w4
#define zva_len_x   x5
#define zva_len     w5
#define zva_bits_x  x6

#define A_l         x7
#define A_lw        w7
#define dst         x8
#define tmp3w       w9

ENTRY_ALIGN (__memset, 6)

        mov     dst, dstin              /* Preserve return value.  */
        ands    A_lw, val, #255
#ifndef DONT_USE_DC
        b.eq    L(zero_mem)
#endif
        orr     A_lw, A_lw, A_lw, lsl #8
        orr     A_lw, A_lw, A_lw, lsl #16
        orr     A_l, A_l, A_l, lsl #32
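        /* The three ORRs above replicate the low byte of val into every
           byte of A_l; e.g. val = 0xAB yields A_lw = 0xABAB, then
           0xABABABAB, then A_l = 0xABABABABABABABAB.  */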
L(tail_maybe_long):
        cmp     count, #64
        b.ge    L(not_short)
L(tail_maybe_tiny):
        cmp     count, #15
        b.le    L(tail15tiny)
L(tail63):
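        /* Bits 5:4 of count select how many 16-byte stores to issue
           before the final overlapping store in L(tail15): 0x30 falls
           through all three STPs, 0x20 enters at 1:, 0x10 enters at 2:,
           and 0x00 branches straight to L(tail15).  (count can be
           negative when we arrive from the bulk store loop; only its
           low six bits are used from here on.)  */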
        ands    tmp1, count, #0x30
        b.eq    L(tail15)
        add     dst, dst, tmp1
        cmp     tmp1w, #0x20
        b.eq    1f
        b.lt    2f
        stp     A_l, A_l, [dst, #-48]
1:
        stp     A_l, A_l, [dst, #-32]
2:
        stp     A_l, A_l, [dst, #-16]

L(tail15):
        and     count, count, #15
        add     dst, dst, count
        stp     A_l, A_l, [dst, #-16]   /* Repeat some/all of last store.  */
        RET

L(tail15tiny):
        /* Set up to 15 bytes.  Does not assume earlier memory
           being set.  */
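        /* Each TBZ below tests one bit of count and stores that many
           bytes (8, 4, 2, then 1); e.g. count = 13 (0b1101) issues an
           8-byte, a 4-byte and a 1-byte store.  */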
        tbz     count, #3, 1f
        str     A_l, [dst], #8
1:
        tbz     count, #2, 1f
        str     A_lw, [dst], #4
1:
        tbz     count, #1, 1f
        strh    A_lw, [dst], #2
1:
        tbz     count, #0, 1f
        strb    A_lw, [dst]
1:
        RET

        /* Critical loop.  Start at a new cache line boundary.  Assuming
         * 64 bytes per line, this ensures the entire loop is in one line.  */
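        /* (The .p2align 6 below pads to a 2**6 = 64-byte boundary, the
           cache-line size assumed above.)  */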
        .p2align 6
L(not_short):
        neg     tmp2, dst
        ands    tmp2, tmp2, #15
        b.eq    2f
        /* Bring DST to 128-bit (16-byte) alignment.  We know that there's
         * more than that to set, so we simply store 16 bytes and advance by
         * the amount required to reach alignment.  */
        sub     count, count, tmp2
        stp     A_l, A_l, [dst]
        add     dst, dst, tmp2
        /* There may be less than 63 bytes to go now.  */
        cmp     count, #63
        b.le    L(tail63)
2:
        sub     dst, dst, #16           /* Pre-bias.  */
        sub     count, count, #64
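        /* dst is biased down by 16 so the write-back STP at [dst, #64]!
           advances dst by exactly one 64-byte block per iteration, and
           count is biased down by 64 so the SUBS/B.GE pair keeps
           iterating while at least 64 more bytes need to be written.  */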
1:
        stp     A_l, A_l, [dst, #16]
        stp     A_l, A_l, [dst, #32]
        stp     A_l, A_l, [dst, #48]
        stp     A_l, A_l, [dst, #64]!
        subs    count, count, #64
        b.ge    1b
        tst     count, #0x3f
        add     dst, dst, #16
        b.ne    L(tail63)
        RET

#ifndef DONT_USE_DC
        /* For zeroing memory, check to see if we can use the ZVA feature to
         * zero entire 'cache' lines.  */
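        /* A single DC ZVA zeroes one whole block of memory; the block
           size is reported by DCZID_EL0 (commonly 64 bytes), which is
           what makes it cheaper than a run of STPs for large zero
           fills.  */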
L(zero_mem):
        mov     A_l, #0
        cmp     count, #63
        b.le    L(tail_maybe_tiny)
        neg     tmp2, dst
        ands    tmp2, tmp2, #15
        b.eq    1f
        sub     count, count, tmp2
        stp     A_l, A_l, [dst]
        add     dst, dst, tmp2
        cmp     count, #63
        b.le    L(tail63)
1:
        /* For zeroing small amounts of memory, it's not worth setting up
         * the line-clear code.  */
        cmp     count, #128
        b.lt    L(not_short)
#ifdef MAYBE_VIRT
        /* For efficiency when virtualized, we cache the ZVA capability.  */
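        /* The word cached at L(cache_clear) is 0 while the capability is
           still unknown, negative (~0) if DC ZVA is unavailable, and
           otherwise holds the ZVA block length in bytes.  */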
        adrp    tmp2, L(cache_clear)
        ldr     zva_len, [tmp2, #:lo12:L(cache_clear)]
        tbnz    zva_len, #31, L(not_short)
        cbnz    zva_len, L(zero_by_line)
        mrs     tmp1, dczid_el0
        tbz     tmp1, #4, 1f
        /* ZVA not available.  Remember this for next time.  */
        mov     zva_len, #~0
        str     zva_len, [tmp2, #:lo12:L(cache_clear)]
        b       L(not_short)
1:
        mov     tmp3w, #4
        and     zva_len, tmp1w, #15     /* Safety: other bits reserved.  */
        lsl     zva_len, tmp3w, zva_len
        str     zva_len, [tmp2, #:lo12:L(cache_clear)]
#else
        mrs     tmp1, dczid_el0
        tbnz    tmp1, #4, L(not_short)
        mov     tmp3w, #4
        and     zva_len, tmp1w, #15     /* Safety: other bits reserved.  */
        lsl     zva_len, tmp3w, zva_len
#endif

L(zero_by_line):
        /* Compute how far we need to go to become suitably aligned.  We're
         * already at quad-word alignment.  */
        cmp     count, zva_len_x
        b.lt    L(not_short)            /* Not enough to reach alignment.  */
        sub     zva_bits_x, zva_len_x, #1
        neg     tmp2, dst
        ands    tmp2, tmp2, zva_bits_x
        b.eq    1f                      /* Already aligned.  */
        /* Not aligned, check that there's enough to zero after alignment.  */
        sub     tmp1, count, tmp2
        cmp     tmp1, #64
        ccmp    tmp1, zva_len_x, #8, ge /* NZCV=0b1000 */
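        /* If the GE condition from the CMP above holds (tmp1 >= 64), the
           CCMP compares tmp1 against zva_len_x; otherwise it sets the
           flags to 0b1000 (N set), which also satisfies LT.  The B.LT
           below therefore falls back unless tmp1 >= 64 and
           tmp1 >= zva_len_x.  */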
        b.lt    L(not_short)
        /* We know that there's at least 64 bytes to zero and that it's safe
         * to overrun by 64 bytes.  */
        mov     count, tmp1
2:
        stp     A_l, A_l, [dst]
        stp     A_l, A_l, [dst, #16]
        stp     A_l, A_l, [dst, #32]
        subs    tmp2, tmp2, #64
        stp     A_l, A_l, [dst, #48]
        add     dst, dst, #64
        b.ge    2b
        /* We've overrun a bit, so adjust dst downwards.  */
        add     dst, dst, tmp2
1:
        sub     count, count, zva_len_x
3:
        dc      zva, dst
        add     dst, dst, zva_len_x
        subs    count, count, zva_len_x
        b.ge    3b
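        /* count was already reduced by one block at 1:, so the loop
           above keeps issuing DC ZVA while at least one full block
           remains; the leftover (count & zva_bits_x) bytes are finished
           by the ordinary store paths via L(tail_maybe_long).  */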
        ands    count, count, zva_bits_x
        b.ne    L(tail_maybe_long)
        RET
#ifdef MAYBE_VIRT
        .bss
        .p2align 2
L(cache_clear):
        .space 4
#endif
#endif /* DONT_USE_DC */

END (__memset)
weak_alias (__memset, memset)
libc_hidden_builtin_def (memset)