/* Copyright (C) 2012-2019 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>
/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define A_hw	w7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_l	x10
#define C_h	x11
#define D_l	x12
#define D_h	x13
#define E_l	src
#define E_h	count
#define F_l	srcend
#define F_h	dst
#define G_l	count
#define G_h	dst
#define tmp1	x14
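
/* E, F and G deliberately alias registers that also hold src, count,
   srcend and dst: by the time those pairs are loaded, the original
   values they shadow are no longer needed.  */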

/* Copies are split into 3 main cases: small copies of up to 16 bytes,
   medium copies of 17..96 bytes, which are fully unrolled, and large
   copies of more than 96 bytes, which align the destination and use an
   unrolled loop processing 64 bytes per iteration.
   In order to share code with memmove, small and medium copies read all
   data before writing, allowing any kind of overlap.  So small, medium
   and large backwards memmoves are handled by falling through into memcpy.
   Overlapping large forward memmoves use a loop that copies backwards.  */
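
/* Rough C outline of the size dispatch below (illustrative only, not
   part of this file):

     if (count <= 16) goto copy16;     // small
     if (count <= 96) goto medium;     // fully unrolled
     goto copy_long;                   // aligned 64-byte loop
 */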

#ifndef MEMMOVE
# define MEMMOVE memmove
#endif
#ifndef MEMCPY
# define MEMCPY memcpy
#endif

ENTRY_ALIGN (MEMMOVE, 6)

	DELOUSE (0)
	DELOUSE (1)
	DELOUSE (2)

	sub	tmp1, dstin, src
	cmp	count, 96
	ccmp	tmp1, count, 2, hi
	b.lo	L(move_long)
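
	/* The ccmp compares dstin - src with count only when count > 96
	   (hi); otherwise it sets NZCV to 2, i.e. C = 1, so b.lo is not
	   taken.  Hence L(move_long) is reached only for large copies
	   where dstin - src < count (unsigned), i.e. a forward overlap
	   that the copy loop below would corrupt.  */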

	/* Common case falls through into memcpy.  */
END (MEMMOVE)
libc_hidden_builtin_def (MEMMOVE)
ENTRY (MEMCPY)

	DELOUSE (0)
	DELOUSE (1)
	DELOUSE (2)

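	/* PLDL1KEEP: prefetch for a load into the L1 cache with the
	   temporal ("keep") policy; a hint that [src] is about to be
	   read.  */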
	prfm	PLDL1KEEP, [src]
	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 16
	b.ls	L(copy16)
	cmp	count, 96
	b.hi	L(copy_long)

	/* Medium copies: 17..96 bytes.  */
	sub	tmp1, count, 1
	ldp	A_l, A_h, [src]
	tbnz	tmp1, 6, L(copy96)
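	/* Here count is 17..96, so bit 6 of count - 1 is set iff
	   count >= 65 (the copy96 case).  With bit 6 clear, bit 5 is set
	   iff count >= 33, in which case the middle 32 bytes are copied
	   as well.  */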
	ldp	D_l, D_h, [srcend, -16]
	tbz	tmp1, 5, 1f
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [srcend, -32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstend, -32]
1:
	stp	A_l, A_h, [dstin]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Small copies: 0..16 bytes.  */
L(copy16):
	cmp	count, 8
	b.lo	1f
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret

	.p2align 4
1:
	tbz	count, 2, 1f
	ldr	A_lw, [src]
	ldr	A_hw, [srcend, -4]
	str	A_lw, [dstin]
	str	A_hw, [dstend, -4]
	ret

	/* Copy 0..3 bytes.  Use a branchless sequence that copies the same
	   byte 3 times if count==1, or the 2nd byte twice if count==2.  */
1:
	cbz	count, 2f
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	A_hw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	A_hw, [dstend, -1]
2:	ret
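
	/* The 0..3 byte case above, roughly in C (illustrative only):

	     if (count) {
	       dst[0]         = src[0];
	       dst[count / 2] = src[count / 2];
	       dst[count - 1] = src[count - 1];
	     }

	   count == 1 writes byte 0 three times, count == 2 writes the
	   2nd byte twice, count == 3 writes each byte exactly once.  */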

	.p2align 4
	/* Copy 64..96 bytes.  Copy 64 bytes from the start and
	   32 bytes from the end.  */
L(copy96):
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [src, 32]
	ldp	D_l, D_h, [src, 48]
	ldp	E_l, E_h, [srcend, -32]
	ldp	F_l, F_h, [srcend, -16]
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin, 32]
	stp	D_l, D_h, [dstin, 48]
	stp	E_l, E_h, [dstend, -32]
	stp	F_l, F_h, [dstend, -16]
	ret
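
	/* For count < 96 the two store groups overlap in the middle;
	   that is harmless because all loads are issued before the
	   first store.  */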

	/* Align DST to 16 byte alignment so that we don't cross cache line
	   boundaries on both loads and stores.  There are at least 96 bytes
	   to copy, so copy 16 bytes unaligned and then align.  The loop
	   copies 64 bytes per iteration and prefetches one iteration ahead.  */

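	/* The alignment arithmetic below, roughly in C (illustrative
	   only):

	     tmp1   = (uintptr_t) dstin & 15;  // misalignment of dst
	     dst    = dstin - tmp1;            // rounded down, 16-byte aligned
	     src   -= tmp1;                    // keep src and dst in lockstep
	     count += tmp1;                    // now relative to the aligned base

	   The first 16 bytes are stored to dstin before src is rewound,
	   so nothing below dstin is ever written.  */
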
	.p2align 4
L(copy_long):
	and	tmp1, dstin, 15
	bic	dst, dstin, 15
	ldp	D_l, D_h, [src]
	sub	src, src, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_l, A_h, [src, 16]
	stp	D_l, D_h, [dstin]
	ldp	B_l, B_h, [src, 32]
	ldp	C_l, C_h, [src, 48]
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	L(last64)
L(loop64):
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [src, 16]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [src, 32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [src, 48]
	stp	D_l, D_h, [dst, 64]!
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 64
	b.hi	L(loop64)
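
	/* In the loop above each iteration stores the four pairs loaded
	   by the previous one and immediately issues the next four
	   loads, so the loads run one iteration ahead of the stores.  */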

	/* Write the last full set of 64 bytes.  The remainder is at most 64
	   bytes, so it is safe to always copy 64 bytes from the end even if
	   there is just 1 byte left.  */
L(last64):
	ldp	E_l, E_h, [srcend, -64]
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [srcend, -48]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [srcend, -16]
	stp	D_l, D_h, [dst, 64]
	stp	E_l, E_h, [dstend, -64]
	stp	A_l, A_h, [dstend, -48]
	stp	B_l, B_h, [dstend, -32]
	stp	C_l, C_h, [dstend, -16]
	ret

	.p2align 4
L(move_long):
	cbz	tmp1, 3f

	add	srcend, src, count
	add	dstend, dstin, count

	/* Align dstend to 16 byte alignment so that we don't cross cache line
	   boundaries on both loads and stores.  There are at least 96 bytes
	   to copy, so copy 16 bytes unaligned and then align.  The loop
	   copies 64 bytes per iteration and prefetches one iteration ahead.  */

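	/* Mirror image of L(copy_long), working downwards from the ends
	   of the buffers (illustrative C, same caveats as above):

	     tmp1    = (uintptr_t) dstend & 15;
	     srcend -= tmp1;
	     count  -= tmp1;
	     dstend -= tmp1;                   // now 16-byte aligned
	 */
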
	and	tmp1, dstend, 15
	ldp	D_l, D_h, [srcend, -16]
	sub	srcend, srcend, tmp1
	sub	count, count, tmp1
	ldp	A_l, A_h, [srcend, -16]
	stp	D_l, D_h, [dstend, -16]
	ldp	B_l, B_h, [srcend, -32]
	ldp	C_l, C_h, [srcend, -48]
	ldp	D_l, D_h, [srcend, -64]!
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	2f

	nop
1:
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [srcend, -16]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [srcend, -48]
	stp	D_l, D_h, [dstend, -64]!
	ldp	D_l, D_h, [srcend, -64]!
	subs	count, count, 64
	b.hi	1b

	/* Write the last full set of 64 bytes.  The remainder is at most 64
	   bytes, so it is safe to always copy 64 bytes from the start even if
	   there is just 1 byte left.  */
2:
	ldp	G_l, G_h, [src, 48]
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [src, 32]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [src, 16]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [src]
	stp	D_l, D_h, [dstend, -64]
	stp	G_l, G_h, [dstin, 48]
	stp	A_l, A_h, [dstin, 32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin]
3:	ret

END (MEMCPY)
libc_hidden_builtin_def (MEMCPY)