/* Copyright (C) 2017-2018 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* Assumptions: ARMv8-a, AArch64, falkor, unaligned accesses.  */

#define dstin	x0
#define src	x1
#define count	x2
#define dstlen	x3
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define A_hw	w7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_l	x10
#define C_h	x11
#define D_l	x12
#define D_h	x13
#define E_l	src
#define E_h	count
#define F_l	srcend
#define F_h	dst
#define tmp1	x14

/* Alias with A_l and A_h to train the prefetcher.  */
#define Q_l	x22
#define Q_h	x23
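
/* Note: x22/x23 are not architectural aliases of x6/x7; "alias" here
   refers to the Falkor hardware prefetcher, which is understood to
   track load streams partly by the low bits of the destination
   register number, so x22/x23 (22 mod 16 == 6) feed the same stream
   as x6/x7.  */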

/* RATIONALE:

   The copy has 4 distinct parts:
   * Small copies of 16 bytes and under
   * Medium sized copies of 17-96 bytes
   * Large copies where the source address is higher than the destination
   (forward copies)
   * Large copies where the destination address is higher than the source
   (copy backward, or move).

   We use only two register pairs, x6,x7 and x22,x23, for the copies and
   copy 32 bytes at a time to correctly train the hardware prefetcher for
   better throughput.  */
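
/* For orientation, the dispatch implemented by the entry sequence below
   can be sketched in C.  Illustrative only: the helper names are
   hypothetical and this is not part of the build.

     void *memmove_falkor (void *dstin, const void *src, size_t count)
     {
       // Unsigned difference wraps when dstin < src, so tmp1 < count
       // exactly when dstin lies inside [src, src + count), i.e. a
       // forward copy would overwrite bytes not yet read.
       uintptr_t tmp1 = (uintptr_t) dstin - (uintptr_t) src;

       if (count > 96 && tmp1 < count)
         return move_long (dstin, src, count);   // backward copy
       if (count <= 16)
         return copy16 (dstin, src, count);      // 0..16 bytes
       if (count > 96)
         return copy_long (dstin, src, count);   // large forward copy
       return copy_medium (dstin, src, count);   // 17..96 bytes
     }
*/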
ENTRY_ALIGN (__memmove_falkor, 6)

	sub	tmp1, dstin, src
	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 96
	ccmp	tmp1, count, 2, hi
	b.lo	L(move_long)
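	/* The three instructions above implement
	   "if (count > 96 && dstin - src < count) goto move_long" without an
	   intermediate branch: when count > 96 (hi), ccmp compares tmp1 with
	   count; otherwise it sets NZCV to 2 (C set), so the b.lo falls
	   through.  Small and medium copies thus always go forward, which is
	   safe even for overlaps because they issue all loads before any
	   store.  */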

	cmp	count, 16
	b.ls	L(copy16)
	cmp	count, 96
	b.hi	L(copy_long)

	/* Medium copies: 17..96 bytes.  */
	sub	tmp1, count, 1
	ldp	A_l, A_h, [src]
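	/* tmp1 = count - 1: bit 6 is set iff count >= 65, which needs the
	   96-byte path; bit 5 is set iff count >= 33, which needs an extra
	   pair copied from each end.  For 17..32 bytes the head and tail
	   pairs A and D alone cover the buffer, overlapping in the middle.  */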
	tbnz	tmp1, 6, L(copy96)
	ldp	D_l, D_h, [srcend, -16]
	tbz	tmp1, 5, 1f
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [srcend, -32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstend, -32]
1:
	stp	A_l, A_h, [dstin]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Small copies: 0..16 bytes.  */
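	/* The small and medium paths all use the same head+tail idiom: load
	   from both ends of the buffer before storing anything, and let the
	   two accesses overlap in the middle.  An illustrative C sketch of
	   the 8..16 byte case handled first below (hypothetical helper,
	   assumes <string.h> and <stdint.h>, not part of the build):

	     void copy8_16 (char *dst, const char *src, size_t n)
	     {
	       uint64_t a, b;                  // n is in [8, 16]
	       memcpy (&a, src, 8);            // first 8 bytes
	       memcpy (&b, src + n - 8, 8);    // last 8; overlaps a if n < 16
	       memcpy (dst, &a, 8);
	       memcpy (dst + n - 8, &b, 8);
	     }
	*/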
L(copy16):
	cmp	count, 8
	b.lo	1f
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret
	.p2align 4
1:
	/* 4-7 */
	tbz	count, 2, 1f
	ldr	A_lw, [src]
	ldr	A_hw, [srcend, -4]
	str	A_lw, [dstin]
	str	A_hw, [dstend, -4]
	ret
	.p2align 4
1:
	/* 2-3 */
	tbz	count, 1, 1f
	ldrh	A_lw, [src]
	ldrh	A_hw, [srcend, -2]
	strh	A_lw, [dstin]
	strh	A_hw, [dstend, -2]
	ret
	.p2align 4
1:
	/* 0-1 */
	tbz	count, 0, 1f
	ldrb	A_lw, [src]
	strb	A_lw, [dstin]
1:	ret

	.p2align 4
	/* Copy 64..96 bytes.  Copy 64 bytes from the start and
	   32 bytes from the end.  */
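	/* Note the register reuse in this block: E_l/E_h are src/count and
	   F_l/F_h are srcend/dst (see the defines above).  Clobbering them
	   is safe because every load is issued before any store and none of
	   the aliased values are needed again on this path.  */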
L(copy96):
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [src, 32]
	ldp	D_l, D_h, [src, 48]
	ldp	E_l, E_h, [srcend, -32]
	ldp	F_l, F_h, [srcend, -16]
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin, 32]
	stp	D_l, D_h, [dstin, 48]
	stp	E_l, E_h, [dstend, -32]
	stp	F_l, F_h, [dstend, -16]
	ret

	/* Align SRC to 16 byte alignment so that we don't cross cache line
	   boundaries on both loads and stores.  There are at least 96 bytes
	   to copy, so copy 16 bytes unaligned and then align.  The loop
	   copies 32 bytes per iteration and prefetches one iteration ahead.  */

	.p2align 4
L(copy_long):
	sub	count, count, 64 + 16	/* Test and readjust count.  */
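	/* x22/x23 are callee-saved, so stash them in B_l/B_h (x8/x9, free
	   temporaries here) and restore them in L(last64) before returning;
	   this avoids a stack spill.  */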
	mov	B_l, Q_l
	mov	B_h, Q_h
	ldp	A_l, A_h, [src]
	and	tmp1, src, 15
	bic	src, src, 15
	sub	dst, dstin, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	Q_l, Q_h, [src, 16]!
	stp	A_l, A_h, [dstin]
	ldp	A_l, A_h, [src, 16]!

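	/* The pipeline primed above and the loop below can be sketched in C
	   (illustrative only; load16/store16 are hypothetical 16-byte
	   helpers and count is treated as signed, not part of the build):

	     // On entry src is 16-byte aligned, Q and A hold the next two
	     // 16-byte blocks, and the first 16 bytes are already stored.
	     do {                          // L(loop64)
	       count -= 32;                // subs
	       store16 (dst + 16, Q);      // stp Q_l, Q_h, [dst, 16]
	       Q = load16 (src += 16);     // ldp Q_l, Q_h, [src, 16]!
	       store16 (dst += 32, A);     // stp A_l, A_h, [dst, 32]!
	       A = load16 (src += 16);     // ldp A_l, A_h, [src, 16]!
	     } while (count > 0);          // b.hi
	*/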
L(loop64):
	subs	count, count, 32
	stp	Q_l, Q_h, [dst, 16]
	ldp	Q_l, Q_h, [src, 16]!
	stp	A_l, A_h, [dst, 32]!
	ldp	A_l, A_h, [src, 16]!
	b.hi	L(loop64)

	/* Write the last full set of 32 bytes.  The remainder is at most 32
	   bytes, so it is safe to always copy 32 bytes from the end even if
	   there is just 1 byte left.  */
L(last64):
	ldp	C_l, C_h, [srcend, -32]
	stp	Q_l, Q_h, [dst, 16]
	ldp	Q_l, Q_h, [srcend, -16]
	stp	A_l, A_h, [dst, 32]
	stp	C_l, C_h, [dstend, -32]
	stp	Q_l, Q_h, [dstend, -16]
	mov	Q_l, B_l
	mov	Q_h, B_h
	ret

	.p2align 4
L(move_long):
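	/* tmp1 == 0 means dstin == src: the move is a no-op, so branch
	   straight to the return at label 3 below.  */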
	cbz	tmp1, 3f

	mov	B_l, Q_l
	mov	B_h, Q_h

	/* Align SRCEND to 16 byte alignment so that we don't cross cache line
	   boundaries on both loads and stores.  There are at least 96 bytes
	   to copy, so copy 16 bytes unaligned and then align.  The loop
	   copies 32 bytes per iteration and prefetches one iteration ahead.  */
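	/* The sequence below mirrors L(copy_long): the same two-deep load
	   pipeline run in reverse, pre-decrementing srcend and dstend
	   instead of pre-incrementing src and dst.  */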

	ldp	A_l, A_h, [srcend, -16]
	and	tmp1, srcend, 15
	sub	srcend, srcend, tmp1
	ldp	Q_l, Q_h, [srcend, -16]!
	stp	A_l, A_h, [dstend, -16]
	sub	count, count, tmp1
	ldp	A_l, A_h, [srcend, -16]!
	sub	dstend, dstend, tmp1
	sub	count, count, 64

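	/* At this point count is the number of aligned bytes still to copy
	   minus 64: 32 are already in flight in Q and A, and the final 32
	   at the very start of the buffer are always written by block 2
	   below, so the loop may exit with up to 64 bytes outstanding.  */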
1:
	subs	count, count, 32
	stp	Q_l, Q_h, [dstend, -16]
	ldp	Q_l, Q_h, [srcend, -16]!
	stp	A_l, A_h, [dstend, -32]!
	ldp	A_l, A_h, [srcend, -16]!
	b.hi	1b

	/* Write the last full set of 32 bytes.  The remainder is at most 32
	   bytes, so it is safe to always copy 32 bytes from the start even if
	   there is just 1 byte left.  */
2:
	ldp	C_l, C_h, [src, 16]
	stp	Q_l, Q_h, [dstend, -16]
	ldp	Q_l, Q_h, [src]
	stp	A_l, A_h, [dstend, -32]
	stp	C_l, C_h, [dstin, 16]
	stp	Q_l, Q_h, [dstin]
	mov	Q_l, B_l
	mov	Q_h, B_h
3:	ret

END (__memmove_falkor)
libc_hidden_builtin_def (__memmove_falkor)