/* A Thunderx Optimized memcpy implementation for AARCH64.
   Copyright (C) 2017-2021 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/* The actual code in this memcpy and memmove should be identical to the
   generic version except for the code under '#ifdef THUNDERX'.  This is
   to make it easier to keep this version and the generic version in sync
   for changes that are not specific to thunderx.  */

#include <sysdep.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#define dstin   x0
#define src     x1
#define count   x2
#define dst     x3
#define srcend  x4
#define dstend  x5
#define A_l     x6
#define A_lw    w6
#define A_h     x7
#define A_hw    w7
#define B_l     x8
#define B_lw    w8
#define B_h     x9
#define C_l     x10
#define C_h     x11
#define D_l     x12
#define D_h     x13
#define E_l     src
#define E_h     count
#define F_l     srcend
#define F_h     dst
#define G_l     count
#define G_h     dst
#define tmp1    x14

/* Copies are split into 3 main cases: small copies of up to 16 bytes,
   medium copies of 17..96 bytes, which are fully unrolled, and large
   copies of more than 96 bytes, which align the destination and use an
   unrolled loop processing 64 bytes per iteration.
   In order to share code with memmove, small and medium copies read all
   data before writing, allowing any kind of overlap.  So small, medium
   and large backwards memmoves are handled by falling through into memcpy.
   Overlapping large forward memmoves use a loop that copies backwards.
*/

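/* Illustrative C sketch of the dispatch described above; the helper
   names are hypothetical and only mirror the labels used below:

       if (n <= 16)
         copy_small (dst, src, n);    // L(copy16)
       else if (n <= 96)
         copy_medium (dst, src, n);   // fully unrolled medium path
       else
         copy_large (dst, src, n);    // L(copy_long), 64 bytes/iteration
*/
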
#ifndef MEMMOVE
# define MEMMOVE memmove
#endif
#ifndef MEMCPY
# define MEMCPY memcpy
#endif

#if IS_IN (libc)

# undef MEMCPY
# define MEMCPY __memcpy_thunderx
# undef MEMMOVE
# define MEMMOVE __memmove_thunderx

ENTRY_ALIGN (MEMMOVE, 6)

        PTR_ARG (0)
        PTR_ARG (1)
        SIZE_ARG (2)

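        /* Branch to L(move_long) only for large overlapping forward
           moves; everything else falls through into memcpy.  Roughly,
           in C (illustrative sketch only):

               if (n > 96 && (uintptr_t) (dst - src) < n)
                 goto move_long;   // dst lies inside [src, src + n)

           When count <= 96 the ccmp below forces NZCV to 2 (carry set),
           so the b.lo is not taken.  */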
        sub     tmp1, dstin, src
        cmp     count, 96
        ccmp    tmp1, count, 2, hi
        b.lo    L(move_long)

        /* Common case falls through into memcpy.  */
END (MEMMOVE)
libc_hidden_builtin_def (MEMMOVE)
ENTRY (MEMCPY)

        PTR_ARG (0)
        PTR_ARG (1)
        SIZE_ARG (2)

        prfm    PLDL1KEEP, [src]
        add     srcend, src, count
        add     dstend, dstin, count
        cmp     count, 16
        b.ls    L(copy16)
        cmp     count, 96
        b.hi    L(copy_long)

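        /* In the medium path below, tmp1 = count - 1 is used for bit
           tests: bit 6 set means count >= 65, handled at L(copy96);
           bit 5 set means count >= 33, so a second 32-byte chunk is
           copied from both ends.  All loads are issued before any
           store, so the same code is safe for overlapping memmove.  */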
        /* Medium copies: 17..96 bytes.  */
        sub     tmp1, count, 1
        ldp     A_l, A_h, [src]
        tbnz    tmp1, 6, L(copy96)
        ldp     D_l, D_h, [srcend, -16]
        tbz     tmp1, 5, 1f
        ldp     B_l, B_h, [src, 16]
        ldp     C_l, C_h, [srcend, -32]
        stp     B_l, B_h, [dstin, 16]
        stp     C_l, C_h, [dstend, -32]
1:
        stp     A_l, A_h, [dstin]
        stp     D_l, D_h, [dstend, -16]
        ret

        .p2align 4
        /* Small copies: 0..16 bytes.  */
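        /* 8..16 bytes are handled with two possibly overlapping 8-byte
           accesses (the first 8 from src, the last 8 from srcend - 8);
           4..7 bytes use the same trick with 4-byte accesses.  Both
           loads happen before either store, so overlap is harmless.  */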
L(copy16):
        cmp     count, 8
        b.lo    1f
        ldr     A_l, [src]
        ldr     A_h, [srcend, -8]
        str     A_l, [dstin]
        str     A_h, [dstend, -8]
        ret
        .p2align 4
1:
        tbz     count, 2, 1f
        ldr     A_lw, [src]
        ldr     A_hw, [srcend, -4]
        str     A_lw, [dstin]
        str     A_hw, [dstend, -4]
        ret

        /* Copy 0..3 bytes.  Use a branchless sequence that copies the same
           byte 3 times if count==1, or the 2nd byte twice if count==2.  */
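        /* Roughly equivalent C (illustrative sketch only):

               if (n != 0)
                 {
                   size_t mid = n >> 1;
                   unsigned char a = s[0], b = s[mid], c = s[n - 1];
                   d[0] = a;  d[mid] = b;  d[n - 1] = c;
                 }

           For n == 1 all three stores hit byte 0; for n == 2 the middle
           store rewrites the last byte; for n == 3 each byte is written
           exactly once.  Loads complete before stores, so overlap is
           safe.  */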
1:
        cbz     count, 2f
        lsr     tmp1, count, 1
        ldrb    A_lw, [src]
        ldrb    A_hw, [srcend, -1]
        ldrb    B_lw, [src, tmp1]
        strb    A_lw, [dstin]
        strb    B_lw, [dstin, tmp1]
        strb    A_hw, [dstend, -1]
2:      ret

        .p2align 4
        /* Copy 65..96 bytes.  Copy 64 bytes from the start and
           32 bytes from the end.  */
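        /* n is 65..96 here, so the leading 64 bytes and the trailing
           32 bytes overlap in the middle and together cover the whole
           buffer (n - 32 <= 64 whenever n <= 96).  */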
L(copy96):
        ldp     B_l, B_h, [src, 16]
        ldp     C_l, C_h, [src, 32]
        ldp     D_l, D_h, [src, 48]
        ldp     E_l, E_h, [srcend, -32]
        ldp     F_l, F_h, [srcend, -16]
        stp     A_l, A_h, [dstin]
        stp     B_l, B_h, [dstin, 16]
        stp     C_l, C_h, [dstin, 32]
        stp     D_l, D_h, [dstin, 48]
        stp     E_l, E_h, [dstend, -32]
        stp     F_l, F_h, [dstend, -16]
        ret

        /* Align DST to 16 byte alignment so that we don't cross cache line
           boundaries on both loads and stores.  There are at least 96 bytes
           to copy, so copy 16 bytes unaligned and then align.  The loop
           copies 64 bytes per iteration and prefetches one iteration ahead.  */

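        /* Alignment fix-up, roughly in C (illustrative sketch):

               tmp1 = (uintptr_t) dstin & 15;   // low bits of dst
               dst  = dstin - tmp1;             // 16-byte aligned base
               src -= tmp1;                     // keep src in lockstep
               n   += tmp1;                     // count from aligned base

           The first 16 bytes are stored unaligned at dstin, then the
           loop stores to 16-byte-aligned addresses from dst + 16 on.  */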
        .p2align 4
L(copy_long):

        /* On thunderx, large memcpys are helped by software prefetching.
           This loop is identical to the one below it but with prefetching
           instructions included.  For copies of less than 32768 bytes, the
           prefetching does not help and slows the code down, so we only
           use the prefetching loop for the largest memcpys.  */

        cmp     count, #32768
        b.lo    L(copy_long_without_prefetch)
        and     tmp1, dstin, 15
        bic     dst, dstin, 15
        ldp     D_l, D_h, [src]
        sub     src, src, tmp1
        prfm    pldl1strm, [src, 384]
        add     count, count, tmp1      /* Count is now 16 too large.  */
        ldp     A_l, A_h, [src, 16]
        stp     D_l, D_h, [dstin]
        ldp     B_l, B_h, [src, 32]
        ldp     C_l, C_h, [src, 48]
        ldp     D_l, D_h, [src, 64]!
        subs    count, count, 128 + 16  /* Test and readjust count.  */

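        /* In the loop below, the tbz/prfm pair issues one PLDL1STRM
           prefetch 512 bytes ahead whenever bit 6 of src is set; since
           src advances 64 bytes per iteration, that is one prefetch per
           128 bytes copied.  */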
L(prefetch_loop64):
        tbz     src, #6, 1f
        prfm    pldl1strm, [src, 512]
1:
        stp     A_l, A_h, [dst, 16]
        ldp     A_l, A_h, [src, 16]
        stp     B_l, B_h, [dst, 32]
        ldp     B_l, B_h, [src, 32]
        stp     C_l, C_h, [dst, 48]
        ldp     C_l, C_h, [src, 48]
        stp     D_l, D_h, [dst, 64]!
        ldp     D_l, D_h, [src, 64]!
        subs    count, count, 64
        b.hi    L(prefetch_loop64)
        b       L(last64)

L(copy_long_without_prefetch):

        and     tmp1, dstin, 15
        bic     dst, dstin, 15
        ldp     D_l, D_h, [src]
        sub     src, src, tmp1
        add     count, count, tmp1      /* Count is now 16 too large.  */
        ldp     A_l, A_h, [src, 16]
        stp     D_l, D_h, [dstin]
        ldp     B_l, B_h, [src, 32]
        ldp     C_l, C_h, [src, 48]
        ldp     D_l, D_h, [src, 64]!
        subs    count, count, 128 + 16  /* Test and readjust count.  */
        b.ls    L(last64)
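        /* The loop is software-pipelined: each iteration stores the 64
           bytes loaded by the previous iteration while loading the next
           64, with pre-index writeback advancing dst and src.  */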
L(loop64):
        stp     A_l, A_h, [dst, 16]
        ldp     A_l, A_h, [src, 16]
        stp     B_l, B_h, [dst, 32]
        ldp     B_l, B_h, [src, 32]
        stp     C_l, C_h, [dst, 48]
        ldp     C_l, C_h, [src, 48]
        stp     D_l, D_h, [dst, 64]!
        ldp     D_l, D_h, [src, 64]!
        subs    count, count, 64
        b.hi    L(loop64)

        /* Write the last full set of 64 bytes.  The remainder is at most 64
           bytes, so it is safe to always copy 64 bytes from the end even if
           there is just 1 byte left.  */
L(last64):
        ldp     E_l, E_h, [srcend, -64]
        stp     A_l, A_h, [dst, 16]
        ldp     A_l, A_h, [srcend, -48]
        stp     B_l, B_h, [dst, 32]
        ldp     B_l, B_h, [srcend, -32]
        stp     C_l, C_h, [dst, 48]
        ldp     C_l, C_h, [srcend, -16]
        stp     D_l, D_h, [dst, 64]
        stp     E_l, E_h, [dstend, -64]
        stp     A_l, A_h, [dstend, -48]
        stp     B_l, B_h, [dstend, -32]
        stp     C_l, C_h, [dstend, -16]
        ret

        .p2align 4
L(move_long):
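        /* tmp1 (== dstin - src) is zero when src and dst are identical;
           there is nothing to move in that case.  */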
        cbz     tmp1, 3f

        add     srcend, src, count
        add     dstend, dstin, count

        /* Align dstend to 16 byte alignment so that we don't cross cache line
           boundaries on both loads and stores.  There are at least 96 bytes
           to copy, so copy 16 bytes unaligned and then align.  The loop
           copies 64 bytes per iteration, with the loads running one
           iteration ahead of the stores.  */

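        /* Backward alignment fix-up, roughly in C (illustrative sketch):

               tmp1 = (uintptr_t) dstend & 15;   // trailing misalignment
               srcend -= tmp1;
               dstend -= tmp1;
               n      -= tmp1;

           The last 16 bytes are stored unaligned first; after that the
           loop stores end at 16-byte-aligned dstend addresses.  */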
        and     tmp1, dstend, 15
        ldp     D_l, D_h, [srcend, -16]
        sub     srcend, srcend, tmp1
        sub     count, count, tmp1
        ldp     A_l, A_h, [srcend, -16]
        stp     D_l, D_h, [dstend, -16]
        ldp     B_l, B_h, [srcend, -32]
        ldp     C_l, C_h, [srcend, -48]
        ldp     D_l, D_h, [srcend, -64]!
        sub     dstend, dstend, tmp1
        subs    count, count, 128
        b.ls    2f

        nop
1:
        stp     A_l, A_h, [dstend, -16]
        ldp     A_l, A_h, [srcend, -16]
        stp     B_l, B_h, [dstend, -32]
        ldp     B_l, B_h, [srcend, -32]
        stp     C_l, C_h, [dstend, -48]
        ldp     C_l, C_h, [srcend, -48]
        stp     D_l, D_h, [dstend, -64]!
        ldp     D_l, D_h, [srcend, -64]!
        subs    count, count, 64
        b.hi    1b

        /* Write the last full set of 64 bytes.  The remainder is at most 64
           bytes, so it is safe to always copy 64 bytes from the start even if
           there is just 1 byte left.  */
2:
        ldp     G_l, G_h, [src, 48]
        stp     A_l, A_h, [dstend, -16]
        ldp     A_l, A_h, [src, 32]
        stp     B_l, B_h, [dstend, -32]
        ldp     B_l, B_h, [src, 16]
        stp     C_l, C_h, [dstend, -48]
        ldp     C_l, C_h, [src]
        stp     D_l, D_h, [dstend, -64]
        stp     G_l, G_h, [dstin, 48]
        stp     A_l, A_h, [dstin, 32]
        stp     B_l, B_h, [dstin, 16]
        stp     C_l, C_h, [dstin]
3:      ret

END (MEMCPY)
libc_hidden_builtin_def (MEMCPY)

#endif