/* Generic optimized memcpy using SIMD.
   Copyright (C) 2020-2022 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
 *
 */

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_lw	w10
#define tmp1	x14

#define A_q	q0
#define B_q	q1
#define C_q	q2
#define D_q	q3
#define E_q	q4
#define F_q	q5
#define G_q	q6
#define H_q	q7

/* This implementation supports both memcpy and memmove and shares most code.
   It uses unaligned accesses and branchless sequences to keep the code small
   and simple and to improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The overhead of the overlap
   check in memmove is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per
   iteration.  The source pointer is 16-byte aligned to minimize unaligned
   accesses.  The loop tail is handled by always copying 64 bytes from the
   end.  */

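/* For reference, the size dispatch below corresponds roughly to this C
   outline (an illustrative sketch only; copy_small, copy_medium and
   copy_large are hypothetical names for the L(copy16)/L(copy32_128)/
   L(copy_long) paths, not symbols defined in this file):

     if (count > 128)
       copy_large (dstin, src, count);   /* aligned 64-byte loop + tail  */
     else if (count > 32)
       copy_medium (dstin, src, count);  /* 2..4 pairs of Q registers  */
     else
       copy_small (dstin, src, count);   /* overlapping loads from both ends  */
*/
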
ENTRY (__memcpy_simd)
	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 128
	b.hi	L(copy_long)
	cmp	count, 32
	b.hi	L(copy32_128)

	/* Small copies: 0..32 bytes.  */
	cmp	count, 16
	b.lo	L(copy16)
	ldr	A_q, [src]
	ldr	B_q, [srcend, -16]
	str	A_q, [dstin]
	str	B_q, [dstend, -16]
	ret

	/* Copy 8-15 bytes.  */
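	/* count is 0..15 here; bit 3 of count distinguishes 8..15 from
	   0..7, so tbz dispatches to L(copy8) when count < 8.  */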
L(copy16):
	tbz	count, 3, L(copy8)
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret

	/* Copy 4-7 bytes.  */
L(copy8):
	tbz	count, 2, L(copy4)
	ldr	A_lw, [src]
	ldr	B_lw, [srcend, -4]
	str	A_lw, [dstin]
	str	B_lw, [dstend, -4]
	ret

	/* Copy 0..3 bytes using a branchless sequence.  */
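	/* tmp1 = count/2 below, so for count = 1, 2 or 3 the three byte
	   copies hit offsets 0, count/2 and count - 1, which together
	   cover the whole range (possibly overlapping).  */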
L(copy4):
	cbz	count, L(copy0)
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	C_lw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	C_lw, [dstend, -1]
L(copy0):
	ret

	.p2align 4
	/* Medium copies: 33..128 bytes.  */
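	/* Both the first and last 32 bytes are loaded before any store,
	   so the stores may overlap in the middle; for 33..64 bytes the
	   two stp pairs below cover the buffer by themselves.  */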
L(copy32_128):
	ldp	A_q, B_q, [src]
	ldp	C_q, D_q, [srcend, -32]
	cmp	count, 64
	b.hi	L(copy128)
	stp	A_q, B_q, [dstin]
	stp	C_q, D_q, [dstend, -32]
	ret

	.p2align 4
	/* Copy 65..128 bytes.  */
L(copy128):
	ldp	E_q, F_q, [src, 32]
	cmp	count, 96
	b.ls	L(copy96)
	ldp	G_q, H_q, [srcend, -64]
	stp	G_q, H_q, [dstend, -64]
L(copy96):
	stp	A_q, B_q, [dstin]
	stp	E_q, F_q, [dstin, 32]
	stp	C_q, D_q, [dstend, -32]
	ret

	/* Align loop64 below to 16 bytes.  */
	nop

	/* Copy more than 128 bytes.  */
L(copy_long):
	/* Copy 16 bytes and then align src to 16-byte alignment.  */
	ldr	D_q, [src]
	and	tmp1, src, 15
	bic	src, src, 15
	sub	dst, dstin, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_q, B_q, [src, 16]
	str	D_q, [dstin]
	ldp	C_q, D_q, [src, 48]
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	L(copy64_from_end)
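
	/* Software pipelined loop: each iteration stores the 64 bytes
	   loaded by the previous one and loads the next 64.  Since dst was
	   offset by the same amount as src, all loads are 16-byte aligned
	   while stores may remain unaligned.  */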
L(loop64):
	stp	A_q, B_q, [dst, 16]
	ldp	A_q, B_q, [src, 80]
	stp	C_q, D_q, [dst, 48]
	ldp	C_q, D_q, [src, 112]
	add	src, src, 64
	add	dst, dst, 64
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last iteration and copy 64 bytes from the end.  */
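	/* These stores may partly overlap bytes written by the final loop
	   iteration, rewriting them with the same data, so no branch on
	   the exact remainder is needed.  */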
L(copy64_from_end):
	ldp	E_q, F_q, [srcend, -64]
	stp	A_q, B_q, [dst, 16]
	ldp	A_q, B_q, [srcend, -32]
	stp	C_q, D_q, [dst, 48]
	stp	E_q, F_q, [dstend, -64]
	stp	A_q, B_q, [dstend, -32]
	ret

END (__memcpy_simd)
libc_hidden_builtin_def (__memcpy_simd)


ENTRY (__memmove_simd)
	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 128
	b.hi	L(move_long)
	cmp	count, 32
	b.hi	L(copy32_128)

	/* Small moves: 0..32 bytes.  */
	cmp	count, 16
	b.lo	L(copy16)
	ldr	A_q, [src]
	ldr	B_q, [srcend, -16]
	str	A_q, [dstin]
	str	B_q, [dstend, -16]
	ret

L(move_long):
	/* Only use backward copy if there is an overlap.  */
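	/* The subtraction is treated as unsigned: if dst precedes src the
	   difference wraps to a large value, so tmp1 >= count also covers
	   that case and the forward memcpy path can be reused.  */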
	sub	tmp1, dstin, src
	cbz	tmp1, L(move0)
	cmp	tmp1, count
	b.hs	L(copy_long)

	/* Large backwards copy for overlapping copies.
	   Copy 16 bytes and then align srcend to 16-byte alignment.  */
L(copy_long_backwards):
	ldr	D_q, [srcend, -16]
	and	tmp1, srcend, 15
	bic	srcend, srcend, 15
	sub	count, count, tmp1
	ldp	A_q, B_q, [srcend, -32]
	str	D_q, [dstend, -16]
	ldp	C_q, D_q, [srcend, -64]
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	L(copy64_from_start)

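	/* Mirror of loop64, running from high to low addresses with loads
	   one iteration ahead of stores; the '!' writeback on the last str
	   decrements dstend by 64 for the next iteration.  */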
L(loop64_backwards):
	str	B_q, [dstend, -16]
	str	A_q, [dstend, -32]
	ldp	A_q, B_q, [srcend, -96]
	str	D_q, [dstend, -48]
	str	C_q, [dstend, -64]!
	ldp	C_q, D_q, [srcend, -128]
	sub	srcend, srcend, 64
	subs	count, count, 64
	b.hi	L(loop64_backwards)

	/* Write the last iteration and copy 64 bytes from the start.  */
L(copy64_from_start):
	ldp	E_q, F_q, [src, 32]
	stp	A_q, B_q, [dstend, -32]
	ldp	A_q, B_q, [src]
	stp	C_q, D_q, [dstend, -64]
	stp	E_q, F_q, [dstin, 32]
	stp	A_q, B_q, [dstin]
L(move0):
	ret

END (__memmove_simd)
libc_hidden_builtin_def (__memmove_simd)