/* Copyright (C) 2012-2015 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Unaligned accesses
 */

/* Parameters and result.  */
#define dstin	x0
#define src	x1
#define count	x2
#define tmp1	x3
#define tmp1w	w3
#define tmp2	x4
#define tmp2w	w4
#define tmp3	x5
#define tmp3w	w5
#define dst	x6

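/* Data registers for the bulk copy loops: four ldp/stp pairs,
   i.e. one 64-byte chunk per loop iteration.  */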
#define A_l	x7
#define A_h	x8
#define B_l	x9
#define B_h	x10
#define C_l	x11
#define C_h	x12
#define D_l	x13
#define D_h	x14

ENTRY_ALIGN (memmove, 6)

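	/* Three cases: if DST is below SRC, a forwards copy is safe and
	 * is handled at L(downwards).  If DST is at or above SRC + COUNT
	 * the buffers do not overlap, so plain memcpy can be used.
	 * Otherwise copy backwards, starting from the end of the buffers.  */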
	cmp	dstin, src
	b.lo	L(downwards)
	add	tmp1, src, count
	cmp	dstin, tmp1
	b.hs	memcpy		/* No overlap.  */

	/* Upwards move with potential overlap.
	 * Need to move from the tail backwards.  SRC and DST point one
	 * byte beyond the remaining data to move.  */
	add	dst, dstin, count
	add	src, src, count
	cmp	count, #64
	b.ge	L(mov_not_short_up)

	/* Deal with small moves quickly by dropping straight into the
	 * exit block.  */
L(tail63up):
	/* Move up to 48 bytes of data.  At this point we only need the
	 * bottom 6 bits of count to be accurate.  */
	ands	tmp1, count, #0x30
	b.eq	L(tail15up)
	sub	dst, dst, tmp1
	sub	src, src, tmp1
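	/* Copy 48, 32 or 16 bytes depending on tmp1; the three ldp/stp
	 * pairs below fall through into one another.  */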
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	ldp	A_l, A_h, [src, #32]
	stp	A_l, A_h, [dst, #32]
1:
	ldp	A_l, A_h, [src, #16]
	stp	A_l, A_h, [dst, #16]
2:
	ldp	A_l, A_h, [src]
	stp	A_l, A_h, [dst]
L(tail15up):
	/* Move up to 15 bytes of data.  Does not assume additional data
	 * being moved.  */
	tbz	count, #3, 1f
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!
1:
	tbz	count, #2, 1f
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
1:
	tbz	count, #1, 1f
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
1:
	tbz	count, #0, 1f
	ldrb	tmp1w, [src, #-1]
	strb	tmp1w, [dst, #-1]
1:
	RET

L(mov_not_short_up):
	/* We don't much care about the alignment of DST, but we want SRC
	 * to be 128-bit (16 byte) aligned so that we don't cross cache line
	 * boundaries on both loads and stores.  */
	ands	tmp2, src, #15		/* Bytes to reach alignment.  */
	b.eq	2f
	sub	count, count, tmp2
	/* Move enough data to reach alignment; unlike memcpy, we have to
	 * be aware of the overlap, which means we can't move data twice.  */
	tbz	tmp2, #3, 1f
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!
1:
	tbz	tmp2, #2, 1f
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
1:
	tbz	tmp2, #1, 1f
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
1:
	tbz	tmp2, #0, 1f
	ldrb	tmp1w, [src, #-1]!
	strb	tmp1w, [dst, #-1]!
1:

	/* There may be less than 63 bytes to go now.  */
	cmp	count, #63
	b.le	L(tail63up)
2:
	subs	count, count, #128
	b.ge	L(mov_body_large_up)
	/* Less than 128 bytes to move, so handle 64 here and then jump
	 * to the tail.  */
	ldp	A_l, A_h, [src, #-64]!
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [src, #32]
	ldp	D_l, D_h, [src, #48]
	stp	A_l, A_h, [dst, #-64]!
	stp	B_l, B_h, [dst, #16]
	stp	C_l, C_h, [dst, #32]
	stp	D_l, D_h, [dst, #48]
	tst	count, #0x3f
	b.ne	L(tail63up)
	RET

	/* Critical loop.  Start at a new Icache line boundary.  Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.  */
	.p2align 6
L(mov_body_large_up):
	/* There are at least 128 bytes to move.  */
	ldp	A_l, A_h, [src, #-16]
	ldp	B_l, B_h, [src, #-32]
	ldp	C_l, C_h, [src, #-48]
	ldp	D_l, D_h, [src, #-64]!
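	/* Software-pipelined loop: each iteration stores the 64 bytes
	 * loaded on the previous pass while loading the next 64.  */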
1:
	stp	A_l, A_h, [dst, #-16]
	ldp	A_l, A_h, [src, #-16]
	stp	B_l, B_h, [dst, #-32]
	ldp	B_l, B_h, [src, #-32]
	stp	C_l, C_h, [dst, #-48]
	ldp	C_l, C_h, [src, #-48]
	stp	D_l, D_h, [dst, #-64]!
	ldp	D_l, D_h, [src, #-64]!
	subs	count, count, #64
	b.ge	1b
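	/* Store the final 64 bytes loaded by the last loop iteration.  */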
	stp	A_l, A_h, [dst, #-16]
	stp	B_l, B_h, [dst, #-32]
	stp	C_l, C_h, [dst, #-48]
	stp	D_l, D_h, [dst, #-64]!
	tst	count, #0x3f
	b.ne	L(tail63up)
	RET

L(downwards):
	/* For a downwards move we can safely use memcpy provided that
	 * DST is more than 16 bytes away from SRC.  */
	sub	tmp1, src, #16
	cmp	dstin, tmp1
	b.ls	memcpy		/* May overlap, but not critically.  */

	mov	dst, dstin	/* Preserve DSTIN for return value.  */
	cmp	count, #64
	b.ge	L(mov_not_short_down)

	/* Deal with small moves quickly by dropping straight into the
	 * exit block.  */
L(tail63down):
	/* Move up to 48 bytes of data.  At this point we only need the
	 * bottom 6 bits of count to be accurate.  */
	ands	tmp1, count, #0x30
	b.eq	L(tail15down)
	add	dst, dst, tmp1
	add	src, src, tmp1
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	ldp	A_l, A_h, [src, #-48]
	stp	A_l, A_h, [dst, #-48]
1:
	ldp	A_l, A_h, [src, #-32]
	stp	A_l, A_h, [dst, #-32]
2:
	ldp	A_l, A_h, [src, #-16]
	stp	A_l, A_h, [dst, #-16]
L(tail15down):
	/* Move up to 15 bytes of data.  Does not assume additional data
	   being moved.  */
	tbz	count, #3, 1f
	ldr	tmp1, [src], #8
	str	tmp1, [dst], #8
1:
	tbz	count, #2, 1f
	ldr	tmp1w, [src], #4
	str	tmp1w, [dst], #4
1:
	tbz	count, #1, 1f
	ldrh	tmp1w, [src], #2
	strh	tmp1w, [dst], #2
1:
	tbz	count, #0, 1f
	ldrb	tmp1w, [src]
	strb	tmp1w, [dst]
1:
	RET

L(mov_not_short_down):
	/* We don't much care about the alignment of DST, but we want SRC
	 * to be 128-bit (16 byte) aligned so that we don't cross cache line
	 * boundaries on both loads and stores.  */
	neg	tmp2, src
	ands	tmp2, tmp2, #15		/* Bytes to reach alignment.  */
	b.eq	2f
	sub	count, count, tmp2
	/* Move enough data to reach alignment; unlike memcpy, we have to
	 * be aware of the overlap, which means we can't move data twice.  */
	tbz	tmp2, #3, 1f
	ldr	tmp1, [src], #8
	str	tmp1, [dst], #8
1:
	tbz	tmp2, #2, 1f
	ldr	tmp1w, [src], #4
	str	tmp1w, [dst], #4
1:
	tbz	tmp2, #1, 1f
	ldrh	tmp1w, [src], #2
	strh	tmp1w, [dst], #2
1:
	tbz	tmp2, #0, 1f
	ldrb	tmp1w, [src], #1
	strb	tmp1w, [dst], #1
1:

	/* There may be less than 63 bytes to go now.  */
	cmp	count, #63
	b.le	L(tail63down)
2:
	subs	count, count, #128
	b.ge	L(mov_body_large_down)
	/* Less than 128 bytes to move, so handle 64 here and then jump
	 * to the tail.  */
	ldp	A_l, A_h, [src]
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [src, #32]
	ldp	D_l, D_h, [src, #48]
	stp	A_l, A_h, [dst]
	stp	B_l, B_h, [dst, #16]
	stp	C_l, C_h, [dst, #32]
	stp	D_l, D_h, [dst, #48]
	tst	count, #0x3f
	add	src, src, #64
	add	dst, dst, #64
	b.ne	L(tail63down)
	RET

	/* Critical loop.  Start at a new cache line boundary.  Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.  */
	.p2align 6
L(mov_body_large_down):
	/* There are at least 128 bytes to move.  */
	ldp	A_l, A_h, [src, #0]
	sub	dst, dst, #16		/* Pre-bias.  */
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [src, #32]
	ldp	D_l, D_h, [src, #48]!	/* src += 64 - Pre-bias.  */
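	/* Same software-pipelined structure as the upwards loop, but
	 * moving forwards through the buffers.  */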
1:
	stp	A_l, A_h, [dst, #16]
	ldp	A_l, A_h, [src, #16]
	stp	B_l, B_h, [dst, #32]
	ldp	B_l, B_h, [src, #32]
	stp	C_l, C_h, [dst, #48]
	ldp	C_l, C_h, [src, #48]
	stp	D_l, D_h, [dst, #64]!
	ldp	D_l, D_h, [src, #64]!
	subs	count, count, #64
	b.ge	1b
	stp	A_l, A_h, [dst, #16]
	stp	B_l, B_h, [dst, #32]
	stp	C_l, C_h, [dst, #48]
	stp	D_l, D_h, [dst, #64]
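	/* Undo the pre-bias so SRC and DST point just past the data
	 * moved so far, ready for the tail copy.  */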
	add	src, src, #16
	add	dst, dst, #64 + 16
	tst	count, #0x3f
	b.ne	L(tail63down)
	RET
END (memmove)

libc_hidden_builtin_def (memmove)