/* Copyright (C) 2012-2015 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Unaligned accesses
 */

/* void *memmove (void *dstin, const void *src, size_t count)
 *
 * AAPCS64 register usage:
 *   x0 = dstin  (destination; never written here, so it is returned intact)
 *   x1 = src    (source)
 *   x2 = count  (bytes to move)
 * Scratch: x3-x14 (named below) and the condition flags.
 *
 * Overlap handling: if DST < SRC (or the regions are >= 16 bytes apart
 * for a downwards move), this tail-calls memcpy.  Otherwise an upwards
 * (backwards-copying) path is used so overlapping bytes are read before
 * they are overwritten.  */

/* Parameters and result.  */
#define dstin	x0
#define src	x1
#define count	x2
#define tmp1	x3
#define tmp1w	w3
#define tmp2	x4
#define tmp2w	w4
#define tmp3	x5
#define tmp3w	w5
#define dst	x6

#define A_l	x7
#define A_h	x8
#define B_l	x9
#define B_h	x10
#define C_l	x11
#define C_h	x12
#define D_l	x13
#define D_h	x14

ENTRY_ALIGN (memmove, 6)

	cmp	dstin, src
	b.lo	L(downwards)
	add	tmp1, src, count
	cmp	dstin, tmp1
	b.hs	memcpy		/* No overlap.  */

	/* Upwards move with potential overlap.
	 * Need to move from the tail backwards.  SRC and DST point one
	 * byte beyond the remaining data to move.  */
	add	dst, dstin, count
	add	src, src, count
	cmp	count, #64
	b.ge	L(mov_not_short_up)

	/* Deal with small moves quickly by dropping straight into the
	 * exit block.  */
L(tail63up):
	/* Move up to 48 bytes of data.  At this point we only need the
	 * bottom 6 bits of count to be accurate.  */
	ands	tmp1, count, #0x30
	b.eq	L(tail15up)
	sub	dst, dst, tmp1
	sub	src, src, tmp1
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	ldp	A_l, A_h, [src, #32]
	stp	A_l, A_h, [dst, #32]
1:
	ldp	A_l, A_h, [src, #16]
	stp	A_l, A_h, [dst, #16]
2:
	ldp	A_l, A_h, [src]
	stp	A_l, A_h, [dst]
L(tail15up):
	/* Move up to 15 bytes of data.  Does not assume additional data
	 * being moved.  Each bit of COUNT selects one power-of-two sized
	 * chunk, moved with pre-indexed (decrementing) addressing.  */
	tbz	count, #3, 1f
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!
1:
	tbz	count, #2, 1f
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
1:
	tbz	count, #1, 1f
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
1:
	tbz	count, #0, 1f
	ldrb	tmp1w, [src, #-1]
	strb	tmp1w, [dst, #-1]
1:
	RET

L(mov_not_short_up):
	/* We don't much care about the alignment of DST, but we want SRC
	 * to be 128-bit (16 byte) aligned so that we don't cross cache line
	 * boundaries on both loads and stores.  */
	ands	tmp2, src, #15		/* Bytes to reach alignment.  */
	b.eq	2f
	sub	count, count, tmp2
	/* Move enough data to reach alignment; unlike memcpy, we have to
	 * be aware of the overlap, which means we can't move data twice.  */
	tbz	tmp2, #3, 1f
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!
1:
	tbz	tmp2, #2, 1f
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
1:
	tbz	tmp2, #1, 1f
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
1:
	tbz	tmp2, #0, 1f
	ldrb	tmp1w, [src, #-1]!
	strb	tmp1w, [dst, #-1]!
1:

	/* There may be less than 63 bytes to go now.  */
	cmp	count, #63
	b.le	L(tail63up)
2:
	subs	count, count, #128
	b.ge	L(mov_body_large_up)
	/* Less than 128 bytes to move, so handle 64 here and then jump
	 * to the tail.  */
	ldp	A_l, A_h, [src, #-64]!
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [src, #32]
	ldp	D_l, D_h, [src, #48]
	stp	A_l, A_h, [dst, #-64]!
	stp	B_l, B_h, [dst, #16]
	stp	C_l, C_h, [dst, #32]
	stp	D_l, D_h, [dst, #48]
	tst	count, #0x3f
	b.ne	L(tail63up)
	RET

	/* Critical loop.  Start at a new Icache line boundary.  Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.  */
	.p2align 6
L(mov_body_large_up):
	/* There are at least 128 bytes to move.  The loop below is
	 * software-pipelined: each iteration stores the four pairs loaded
	 * by the previous one while loading the next 64 bytes.  */
	ldp	A_l, A_h, [src, #-16]
	ldp	B_l, B_h, [src, #-32]
	ldp	C_l, C_h, [src, #-48]
	ldp	D_l, D_h, [src, #-64]!
1:
	stp	A_l, A_h, [dst, #-16]
	ldp	A_l, A_h, [src, #-16]
	stp	B_l, B_h, [dst, #-32]
	ldp	B_l, B_h, [src, #-32]
	stp	C_l, C_h, [dst, #-48]
	ldp	C_l, C_h, [src, #-48]
	stp	D_l, D_h, [dst, #-64]!
	ldp	D_l, D_h, [src, #-64]!
	subs	count, count, #64
	b.ge	1b
	/* Drain the last pre-loaded 64 bytes.  */
	stp	A_l, A_h, [dst, #-16]
	stp	B_l, B_h, [dst, #-32]
	stp	C_l, C_h, [dst, #-48]
	stp	D_l, D_h, [dst, #-64]!
	tst	count, #0x3f
	b.ne	L(tail63up)
	RET

L(downwards):
	/* For a downwards move we can safely use memcpy provided that
	 * DST is more than 16 bytes away from SRC.  */
	sub	tmp1, src, #16
	cmp	dstin, tmp1
	b.ls	memcpy		/* May overlap, but not critically.  */

	mov	dst, dstin	/* Preserve DSTIN for return value.  */
	cmp	count, #64
	b.ge	L(mov_not_short_down)

	/* Deal with small moves quickly by dropping straight into the
	 * exit block.  */
L(tail63down):
	/* Move up to 48 bytes of data.  At this point we only need the
	 * bottom 6 bits of count to be accurate.  */
	ands	tmp1, count, #0x30
	b.eq	L(tail15down)
	add	dst, dst, tmp1
	add	src, src, tmp1
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	ldp	A_l, A_h, [src, #-48]
	stp	A_l, A_h, [dst, #-48]
1:
	ldp	A_l, A_h, [src, #-32]
	stp	A_l, A_h, [dst, #-32]
2:
	ldp	A_l, A_h, [src, #-16]
	stp	A_l, A_h, [dst, #-16]
L(tail15down):
	/* Move up to 15 bytes of data.  Does not assume additional data
	   being moved.  Post-indexed (incrementing) addressing here,
	   mirroring the pre-indexed variant in L(tail15up).  */
	tbz	count, #3, 1f
	ldr	tmp1, [src], #8
	str	tmp1, [dst], #8
1:
	tbz	count, #2, 1f
	ldr	tmp1w, [src], #4
	str	tmp1w, [dst], #4
1:
	tbz	count, #1, 1f
	ldrh	tmp1w, [src], #2
	strh	tmp1w, [dst], #2
1:
	tbz	count, #0, 1f
	ldrb	tmp1w, [src]
	strb	tmp1w, [dst]
1:
	RET

L(mov_not_short_down):
	/* We don't much care about the alignment of DST, but we want SRC
	 * to be 128-bit (16 byte) aligned so that we don't cross cache line
	 * boundaries on both loads and stores.  */
	neg	tmp2, src
	ands	tmp2, tmp2, #15		/* Bytes to reach alignment.  */
	b.eq	2f
	sub	count, count, tmp2
	/* Move enough data to reach alignment; unlike memcpy, we have to
	 * be aware of the overlap, which means we can't move data twice.  */
	tbz	tmp2, #3, 1f
	ldr	tmp1, [src], #8
	str	tmp1, [dst], #8
1:
	tbz	tmp2, #2, 1f
	ldr	tmp1w, [src], #4
	str	tmp1w, [dst], #4
1:
	tbz	tmp2, #1, 1f
	ldrh	tmp1w, [src], #2
	strh	tmp1w, [dst], #2
1:
	tbz	tmp2, #0, 1f
	ldrb	tmp1w, [src], #1
	strb	tmp1w, [dst], #1
1:

	/* There may be less than 63 bytes to go now.  */
	cmp	count, #63
	b.le	L(tail63down)
2:
	subs	count, count, #128
	b.ge	L(mov_body_large_down)
	/* Less than 128 bytes to move, so handle 64 here and then jump
	 * to the tail.  */
	ldp	A_l, A_h, [src]
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [src, #32]
	ldp	D_l, D_h, [src, #48]
	stp	A_l, A_h, [dst]
	stp	B_l, B_h, [dst, #16]
	stp	C_l, C_h, [dst, #32]
	stp	D_l, D_h, [dst, #48]
	tst	count, #0x3f
	add	src, src, #64
	add	dst, dst, #64
	b.ne	L(tail63down)
	RET

	/* Critical loop.  Start at a new cache line boundary.  Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.  */
	.p2align 6
L(mov_body_large_down):
	/* There are at least 128 bytes to move.  Same software-pipelined
	 * shape as L(mov_body_large_up), running forwards.  */
	ldp	A_l, A_h, [src, #0]
	sub	dst, dst, #16		/* Pre-bias.  */
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [src, #32]
	ldp	D_l, D_h, [src, #48]!	/* src += 64 - Pre-bias.  */
1:
	stp	A_l, A_h, [dst, #16]
	ldp	A_l, A_h, [src, #16]
	stp	B_l, B_h, [dst, #32]
	ldp	B_l, B_h, [src, #32]
	stp	C_l, C_h, [dst, #48]
	ldp	C_l, C_h, [src, #48]
	stp	D_l, D_h, [dst, #64]!
	ldp	D_l, D_h, [src, #64]!
	subs	count, count, #64
	b.ge	1b
	/* Drain the last pre-loaded 64 bytes, then undo the pre-bias so
	 * SRC/DST point just past the data moved so far.  */
	stp	A_l, A_h, [dst, #16]
	stp	B_l, B_h, [dst, #32]
	stp	C_l, C_h, [dst, #48]
	stp	D_l, D_h, [dst, #64]
	add	src, src, #16
	add	dst, dst, #64 + 16
	tst	count, #0x3f
	b.ne	L(tail63down)
	RET
END (memmove)

libc_hidden_builtin_def (memmove)