/* Copyright (C) 2017-2019 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* Assumptions: ARMv8-a, AArch64, falkor, unaligned accesses.  */

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_x	x6
#define B_x	x7
#define A_w	w6
#define B_w	w7
#define tmp1	x14

#define Q_q	q6
#define A_q	q22
#define B_q	q18
#define C_q	q19
#define D_q	q20
#define E_q	q21
#define F_q	q17
#define G_q	q23

/* RATIONALE:

   The move has 4 distinct parts:
   * Small moves of 16 bytes and under.
   * Medium-sized moves of 17-96 bytes.
   * Large moves where the source address is higher than the destination
     (forward copies).
   * Large moves where the destination address is higher than the source
     (copy backward, or move).

   We use only two registers, q6 and q22, for the moves and move 32 bytes
   at a time to correctly train the hardware prefetcher for better
   throughput.  */
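/* ENTRY_ALIGN's second argument is a log2 exponent, so the entry point
   is aligned to 2^6 = 64 bytes, keeping the short dispatch sequence
   below within a single 64-byte cache line.  */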
ENTRY_ALIGN (__memmove_falkor, 6)

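	/* Overlap check: tmp1 = dstin - src.  When count > 96 and
	   tmp1 < count (unsigned), the destination lies inside the source
	   range, so a forward copy would clobber unread source bytes;
	   branch to the backward copy.  The ccmp forces C set when
	   count <= 96, so b.lo falls through for all small and medium
	   copies.  */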
	sub	tmp1, dstin, src
	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 96
	ccmp	tmp1, count, 2, hi
	b.lo	L(move_long)

	cmp	count, 16
	b.ls	L(copy16)
	cmp	count, 96
	b.hi	L(copy_long)

	/* Medium copies: 17..96 bytes.  */
	sub	tmp1, count, 1
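	/* Bit 6 of count - 1 is set iff count >= 65, which is handled by
	   copy96; bit 5 is set iff count >= 33, which needs the extra
	   pair of 16-byte moves below.  */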
	ldr	A_q, [src]
	tbnz	tmp1, 6, L(copy96)
	ldr	D_q, [srcend, -16]
	tbz	tmp1, 5, 1f
	ldr	B_q, [src, 16]
	ldr	C_q, [srcend, -32]
	str	B_q, [dstin, 16]
	str	C_q, [dstend, -32]
1:
	str	A_q, [dstin]
	str	D_q, [dstend, -16]
	ret

	.p2align 4
	/* Small copies: 0..16 bytes.  */
L(copy16):
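	/* Each case loads both ends of the buffer before storing
	   anything, so these copies are overlap-safe in either
	   direction.  */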
	cmp	count, 8
	b.lo	1f
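	/* 8-16 */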
	ldr	A_x, [src]
	ldr	B_x, [srcend, -8]
	str	A_x, [dstin]
	str	B_x, [dstend, -8]
	ret
	.p2align 4
1:
	/* 4-7 */
	tbz	count, 2, 1f
	ldr	A_w, [src]
	ldr	B_w, [srcend, -4]
	str	A_w, [dstin]
	str	B_w, [dstend, -4]
	ret
	.p2align 4
1:
	/* 2-3 */
	tbz	count, 1, 1f
	ldrh	A_w, [src]
	ldrh	B_w, [srcend, -2]
	strh	A_w, [dstin]
	strh	B_w, [dstend, -2]
	ret
	.p2align 4
1:
	/* 0-1 */
	tbz	count, 0, 1f
	ldrb	A_w, [src]
	strb	A_w, [dstin]
1:	ret

	.p2align 4
	/* Copy 65..96 bytes.  Copy 64 bytes from the start and
	   32 bytes from the end.  */
L(copy96):
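	/* A_q already holds bytes 0..15 from the medium-copy entry.
	   With 65..96 bytes in total, the 64 bytes stored from the start
	   and the 32 stored from the end overlap harmlessly in the
	   middle.  */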
	ldr	B_q, [src, 16]
	ldr	C_q, [src, 32]
	ldr	D_q, [src, 48]
	ldr	E_q, [srcend, -32]
	ldr	F_q, [srcend, -16]
	str	A_q, [dstin]
	str	B_q, [dstin, 16]
	str	C_q, [dstin, 32]
	str	D_q, [dstin, 48]
	str	E_q, [dstend, -32]
	str	F_q, [dstend, -16]
	ret

	/* Align SRC to 16 byte alignment so that we don't cross cache line
	   boundaries on both loads and stores.  There are at least 96 bytes
	   to copy, so copy 16 bytes unaligned and then align.  The loop
	   copies 32 bytes per iteration and prefetches one iteration ahead.  */

	.p2align 4
L(copy_long):
	ldr	A_q, [src]
	and	tmp1, src, 15
	bic	src, src, 15
	sub	dst, dstin, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldr	Q_q, [src, 16]!
	str	A_q, [dstin]
	ldr	A_q, [src, 16]!
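	/* Readjust count before entering the loop: 16 compensates the
	   over-count noted above, 32 covers the two chunks already
	   loaded into Q_q and A_q, and 64 stays in reserve for the
	   L(last64) tail.  */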
	subs	count, count, 32 + 64 + 16	/* Test and readjust count.  */
	b.ls	L(last64)

L(loop64):
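	/* Each iteration interleaves one 16-byte store with the load of
	   the next chunk, streaming 32 bytes per iteration through only
	   Q_q and A_q; per the rationale above, this is the pattern that
	   trains the falkor hardware prefetcher.  */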
	subs	count, count, 32
	str	Q_q, [dst, 16]
	ldr	Q_q, [src, 16]!
	str	A_q, [dst, 32]!
	ldr	A_q, [src, 16]!
	b.hi	L(loop64)

	/* Write the last full set of 64 bytes.  The remainder is at most 64
	   bytes and at least 33 bytes, so it is safe to always copy 64 bytes
	   from the end.  */
L(last64):
	ldr	C_q, [srcend, -64]
	str	Q_q, [dst, 16]
	ldr	B_q, [srcend, -48]
	str	A_q, [dst, 32]
	ldr	A_q, [srcend, -32]
	ldr	D_q, [srcend, -16]
	str	C_q, [dstend, -64]
	str	B_q, [dstend, -48]
	str	A_q, [dstend, -32]
	str	D_q, [dstend, -16]
	ret

	.p2align 4
L(move_long):
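	/* dstin == src means there is nothing to move.  */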
	cbz	tmp1, 3f

	/* Align SRCEND to 16 byte alignment so that we don't cross cache line
	   boundaries on both loads and stores.  There are at least 96 bytes
	   to copy, so copy 16 bytes unaligned and then align.  The loop
	   copies 32 bytes per iteration and prefetches one iteration ahead.  */

	ldr	A_q, [srcend, -16]
	and	tmp1, srcend, 15
	sub	srcend, srcend, tmp1
	ldr	Q_q, [srcend, -16]!
	str	A_q, [dstend, -16]
	sub	count, count, tmp1
	ldr	A_q, [srcend, -16]!
	sub	dstend, dstend, tmp1
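	/* As in the forward path, 32 bytes are already in flight in Q_q
	   and A_q, and 64 bytes are kept in reserve for the tail copy at
	   2: below.  */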
	subs	count, count, 32 + 64
	b.ls	2f

1:
	subs	count, count, 32
	str	Q_q, [dstend, -16]
	ldr	Q_q, [srcend, -16]!
	str	A_q, [dstend, -32]!
	ldr	A_q, [srcend, -16]!
	b.hi	1b

	/* Write the last full set of 64 bytes.  The remainder is at most 64
	   bytes and at least 33 bytes, so it is safe to always copy 64 bytes
	   from the start.  */
2:
	ldr	C_q, [src, 48]
	str	Q_q, [dstend, -16]
	ldr	B_q, [src, 32]
	str	A_q, [dstend, -32]
	ldr	A_q, [src, 16]
	ldr	D_q, [src]
	str	C_q, [dstin, 48]
	str	B_q, [dstin, 32]
	str	A_q, [dstin, 16]
	str	D_q, [dstin]
3:	ret

END (__memmove_falkor)
libc_hidden_builtin_def (__memmove_falkor)