/* strcpy/stpcpy - copy a string returning pointer to start/end.
   Copyright (C) 2013-2017 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

/* To build as stpcpy, define BUILD_STPCPY before compiling this file.

   To test the page crossing code path more thoroughly, compile with
   -DSTRCPY_TEST_PAGE_CROSS - this will force all unaligned copies through
   the slower entry path.  This option is not intended for production use.  */

#include <sysdep.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
 */

/* Arguments and results.  */
#define dstin           x0
#define srcin           x1

/* Locals and temporaries.  */
#define src             x2
#define dst             x3
#define data1           x4
#define data1w          w4
#define data2           x5
#define data2w          w5
#define has_nul1        x6
#define has_nul2        x7
#define tmp1            x8
#define tmp2            x9
#define tmp3            x10
#define tmp4            x11
#define zeroones        x12
#define data1a          x13
#define data2a          x14
#define pos             x15
#define len             x16
#define to_align        x17

#ifdef BUILD_STPCPY
#define STRCPY __stpcpy
#else
#define STRCPY strcpy
#endif

/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
   can be done in parallel across the entire word.  */

#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080
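
/* For example, loading "hello" little-endian gives X = 0x0000006f6c6c6568.
   Then X - REP8_01 = 0xfefeff6e6b6b6467 and ~(X | REP8_7f) = REP8_80, so
   the syndrome (X - REP8_01) & ~(X | REP8_7f) is 0x8080800000000000: a
   0x80 in exactly those bytes that were NUL.  A byte reverse followed by
   CLZ then yields 8 * (index of the first NUL), here 40, i.e. byte 5.  */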

/* AArch64 systems have a minimum page size of 4k.  We can do a quick
   page size check for crossing this boundary on entry and if we
   do not, then we can short-circuit much of the entry code.  We
   expect early page-crossing strings to be rare (probability of
   16/MIN_PAGE_SIZE ~= 0.4%), so the branch should be quite
   predictable, even with random strings.

   We don't bother checking for larger page sizes; the cost of setting
   up the correct page size is just not worth the extra gain from
   a small reduction in the cases taking the slow path.  Note that
   we only care about whether the first fetch, which may be
   misaligned, crosses a page boundary - after that we move to aligned
   fetches for the remainder of the string.  */

#ifdef STRCPY_TEST_PAGE_CROSS
        /* Make everything that isn't Qword aligned look like a page cross.  */
#define MIN_PAGE_P2 4
#else
#define MIN_PAGE_P2 12
#endif

#define MIN_PAGE_SIZE (1 << MIN_PAGE_P2)

ENTRY_ALIGN (STRCPY, 6)
        DELOUSE (0)
        DELOUSE (1)
        /* For moderately short strings, the fastest way to do the copy is to
           calculate the length of the string in the same way as strlen, then
           essentially do a memcpy of the result.  This avoids the need for
           multiple byte copies and further means that by the time we
           reach the bulk copy loop we know we can always use DWord
           accesses.  We expect strcpy to rarely be called repeatedly
           with the same source string, so branch prediction is likely to
           always be difficult - we mitigate this by preferring
           conditional select operations over branches whenever this is
           feasible.  */
        and     tmp2, srcin, #(MIN_PAGE_SIZE - 1)
        mov     zeroones, #REP8_01
        and     to_align, srcin, #15
        cmp     tmp2, #(MIN_PAGE_SIZE - 16)
        neg     tmp1, to_align
        /* The first fetch will straddle a (possible) page boundary iff
           srcin + 15 causes bit[MIN_PAGE_P2] to change value.  A 16-byte
           aligned string will never fail the page align check, so will
           always take the fast path.  */
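        /* Concretely, with MIN_PAGE_P2 == 12 the slow path is taken only
           when (srcin & 4095) > 4080, i.e. when the first 16-byte fetch
           starts in the last 15 bytes of a 4k page.  */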
        b.gt    L(page_cross)

L(page_cross_ok):
        ldp     data1, data2, [srcin]
#ifdef __AARCH64EB__
        /* Because we expect the end to be found within 16 characters
           (profiling shows this is the most common case), it's worth
           swapping the bytes now to save having to recalculate the
           termination syndrome later.  We preserve data1 and data2
           so that we can re-use the values later on.  */
        rev     tmp2, data1
        sub     tmp1, tmp2, zeroones
        orr     tmp2, tmp2, #REP8_7f
        bics    has_nul1, tmp1, tmp2
        b.ne    L(fp_le8)
        rev     tmp4, data2
        sub     tmp3, tmp4, zeroones
        orr     tmp4, tmp4, #REP8_7f
#else
        sub     tmp1, data1, zeroones
        orr     tmp2, data1, #REP8_7f
        bics    has_nul1, tmp1, tmp2
        b.ne    L(fp_le8)
        sub     tmp3, data2, zeroones
        orr     tmp4, data2, #REP8_7f
#endif
        bics    has_nul2, tmp3, tmp4
        b.eq    L(bulk_entry)

        /* The string is short (<=16 bytes).  We don't know exactly how
           short though, yet.  Work out the exact length so that we can
           quickly select the optimal copy strategy.  */
L(fp_gt8):
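        /* The NUL is somewhere in the second eight bytes, so the string
           occupies 9..16 bytes including the terminator.  pos becomes
           8 * (offset of the NUL within those eight bytes); the data2
           store below is placed so that its last byte lands exactly on
           the NUL, overlapping the data1 store when the string is
           shorter than 16 bytes.  */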
        rev     has_nul2, has_nul2
        clz     pos, has_nul2
        mov     tmp2, #56
        add     dst, dstin, pos, lsr #3         /* Bits to bytes.  */
        sub     pos, tmp2, pos
#ifdef __AARCH64EB__
        lsr     data2, data2, pos
#else
        lsl     data2, data2, pos
#endif
        str     data2, [dst, #1]
        str     data1, [dstin]
#ifdef BUILD_STPCPY
        add     dstin, dst, #8
#endif
        ret

L(fp_le8):
        rev     has_nul1, has_nul1
        clz     pos, has_nul1
        add     dst, dstin, pos, lsr #3         /* Bits to bytes.  */
        subs    tmp2, pos, #24                  /* Pos in bits.  */
        b.lt    L(fp_lt4)
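        /* Here 3 <= strlen <= 7: store a word whose last byte lands
           exactly on the NUL, then the first four bytes; the two stores
           overlap for the shorter lengths.  */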
#ifdef __AARCH64EB__
        mov     tmp2, #56
        sub     pos, tmp2, pos
        lsr     data2, data1, pos
        lsr     data1, data1, #32
#else
        lsr     data2, data1, tmp2
#endif
        /* 4->7 bytes to copy.  */
        str     data2w, [dst, #-3]
        str     data1w, [dstin]
#ifdef BUILD_STPCPY
        mov     dstin, dst
#endif
        ret
L(fp_lt4):
        cbz     pos, L(fp_lt2)
        /* 2->3 bytes to copy.  */
#ifdef __AARCH64EB__
        lsr     data1, data1, #48
#endif
        strh    data1w, [dstin]
        /* Fall-through, one byte (max) to go.  */
L(fp_lt2):
        /* Null-terminated string.  Last character must be zero!  */
        strb    wzr, [dst]
#ifdef BUILD_STPCPY
        mov     dstin, dst
#endif
        ret

        .p2align 6
        /* Aligning here ensures that the entry code and main loop all lie
           within one 64-byte cache line.  */
L(bulk_entry):
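        /* Store the 16 bytes already loaded in data1/data2, then advance
           src to the next 16-byte-aligned source address (and dst by the
           same amount) so that all loads in the main loop are aligned;
           up to 15 bytes are copied a second time as a result.  */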
        sub     to_align, to_align, #16
        stp     data1, data2, [dstin]
        sub     src, srcin, to_align
        sub     dst, dstin, to_align
        b       L(entry_no_page_cross)

        /* The inner loop deals with two Dwords at a time.  This has a
           slightly higher start-up cost, but we should win quite quickly,
           especially on cores with a high number of issue slots per
           cycle, as we get much better parallelism out of the operations.  */
L(main_loop):
        stp     data1, data2, [dst], #16
L(entry_no_page_cross):
        ldp     data1, data2, [src], #16
        sub     tmp1, data1, zeroones
        orr     tmp2, data1, #REP8_7f
        sub     tmp3, data2, zeroones
        orr     tmp4, data2, #REP8_7f
        bic     has_nul1, tmp1, tmp2
        bics    has_nul2, tmp3, tmp4
        ccmp    has_nul1, #0, #0, eq    /* NZCV = 0000  */
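        /* The bics sets EQ iff has_nul2 is zero, and the ccmp compares
           has_nul1 with zero only in that case (otherwise it forces the
           flags to 0000).  EQ therefore survives only when neither
           dword contained a NUL and the loop continues.  */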
        b.eq    L(main_loop)

        /* Since we know we are copying at least 16 bytes, the fastest way
           to deal with the tail is to determine the location of the
           trailing NUL, then (re)copy the 16 bytes leading up to that.  */
        cmp     has_nul1, #0
#ifdef __AARCH64EB__
        /* For big-endian, carry propagation (if the final byte in the
           string is 0x01) means we cannot use has_nul directly.  The
           easiest way to get the correct byte is to byte-swap the data
           and calculate the syndrome a second time.  */
        csel    data1, data1, data2, ne
        rev     data1, data1
        sub     tmp1, data1, zeroones
        orr     tmp2, data1, #REP8_7f
        bic     has_nul1, tmp1, tmp2
#else
        csel    has_nul1, has_nul1, has_nul2, ne
#endif
        rev     has_nul1, has_nul1
        clz     pos, has_nul1
        add     tmp1, pos, #72
        add     pos, pos, #8
        csel    pos, pos, tmp1, ne
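        /* pos is now 8 * (1 + offset of the NUL within the 16 bytes just
           loaded); the +8 and +72 add the terminator itself and, when the
           NUL is in the second dword, the first dword as well.  Advancing
           src and dst by pos / 8 bytes leaves them one byte past the NUL,
           so the final 16-byte copy below ends exactly on it.  */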
        add     src, src, pos, lsr #3
        add     dst, dst, pos, lsr #3
        ldp     data1, data2, [src, #-32]
        stp     data1, data2, [dst, #-16]
#ifdef BUILD_STPCPY
        sub     dstin, dst, #1
#endif
        ret

L(page_cross):
        bic     src, srcin, #15
        /* Start by loading two words at [srcin & ~15], then forcing the
           bytes that precede srcin to 0xff.  This means they never look
           like termination bytes.  */
        ldp     data1, data2, [src]
        lsl     tmp1, tmp1, #3  /* Bytes beyond alignment -> bits.  */
        tst     to_align, #7
        csetm   tmp2, ne
#ifdef __AARCH64EB__
        lsl     tmp2, tmp2, tmp1        /* Shift (tmp1 & 63).  */
#else
        lsr     tmp2, tmp2, tmp1        /* Shift (tmp1 & 63).  */
#endif
        orr     data1, data1, tmp2
        orr     data2a, data2, tmp2
        cmp     to_align, #8
        csinv   data1, data1, xzr, lt
        csel    data2, data2, data2a, lt
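        /* tmp2 is an all-ones mask covering the first (to_align % 8)
           bytes of a fetched dword (zero when to_align is a multiple of
           eight).  For to_align < 8 the mask is applied to data1;
           otherwise all of data1 precedes srcin and is forced to 0xff,
           and the masked copy in data2a is used for data2 instead.  */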
        sub     tmp1, data1, zeroones
        orr     tmp2, data1, #REP8_7f
        sub     tmp3, data2, zeroones
        orr     tmp4, data2, #REP8_7f
        bic     has_nul1, tmp1, tmp2
        bics    has_nul2, tmp3, tmp4
        ccmp    has_nul1, #0, #0, eq    /* NZCV = 0000  */
        b.eq    L(page_cross_ok)
        /* We now need to make data1 and data2 look like they've been
           loaded directly from srcin.  Do a rotate on the 128-bit value.  */
        lsl     tmp1, to_align, #3      /* Bytes->bits.  */
        neg     tmp2, to_align, lsl #3
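        /* Variable shifts use only the low six bits of the count, so
           the negated value behaves as the complementary shift
           64 - 8 * to_align in the shifts below.  */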
#ifdef __AARCH64EB__
        lsl     data1a, data1, tmp1
        lsr     tmp4, data2, tmp2
        lsl     data2, data2, tmp1
        orr     tmp4, tmp4, data1a
        cmp     to_align, #8
        csel    data1, tmp4, data2, lt
        rev     tmp2, data1
        rev     tmp4, data2
        sub     tmp1, tmp2, zeroones
        orr     tmp2, tmp2, #REP8_7f
        sub     tmp3, tmp4, zeroones
        orr     tmp4, tmp4, #REP8_7f
#else
        lsr     data1a, data1, tmp1
        lsl     tmp4, data2, tmp2
        lsr     data2, data2, tmp1
        orr     tmp4, tmp4, data1a
        cmp     to_align, #8
        csel    data1, tmp4, data2, lt
        sub     tmp1, data1, zeroones
        orr     tmp2, data1, #REP8_7f
        sub     tmp3, data2, zeroones
        orr     tmp4, data2, #REP8_7f
#endif
        bic     has_nul1, tmp1, tmp2
        cbnz    has_nul1, L(fp_le8)
        bic     has_nul2, tmp3, tmp4
        b       L(fp_gt8)
END (STRCPY)

#ifdef BUILD_STPCPY
weak_alias (__stpcpy, stpcpy)
libc_hidden_def (__stpcpy)
libc_hidden_builtin_def (stpcpy)
#else
libc_hidden_builtin_def (strcpy)
#endif