/* sysdeps/aarch64/multiarch/memmove_falkor.S
   Source: glibc tree (browsed via the git.ipfire.org thirdparty/glibc.git
   mirror); commit message: "Update copyright dates with
   scripts/update-copyrights."  */
/* Copyright (C) 2017-2018 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <http://www.gnu.org/licenses/>.  */
#include <sysdep.h>

/* Assumptions: ARMv8-a, AArch64, falkor, unaligned accesses.  */

/* Symbolic register names.  AAPCS64 arguments: x0 = destination,
   x1 = source, x2 = byte count.  Note the deliberate aliasing below:
   dst shares x3, and E_l/E_h/F_l/F_h reuse src/count/srcend/dst once
   those values are no longer needed (only in the 64..96-byte path).  */
#define dstin	x0
#define src	x1
#define count	x2
#define dstlen	x3
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define A_hw	w7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_l	x10
#define C_h	x11
#define D_l	x12
#define D_h	x13
#define E_l	src
#define E_h	count
#define F_l	srcend
#define F_h	dst
#define tmp1	x14

/* Alias with A_l and A_h to train the prefetcher.  NOTE: x22/x23 are
   callee-saved under AAPCS64; the large-copy paths stash them in B_l/B_h
   and restore them before returning instead of spilling to the stack.  */
#define Q_l	x22
#define Q_h	x23

/* RATIONALE:

   The copy has 4 distinct parts:
   * Small copies of 16 bytes and under
   * Medium sized copies of 17-96 bytes
   * Large copies where the source address is higher than the destination
   (forward copies)
   * Large copies where the destination address is higher than the source
   (copy backward, or move).

   We use only two register pairs x6,x7 and x22,x23 for the copies and copy
   32 bytes at a time to correctly train the hardware prefetcher for better
   throughput.  */
/* void *__memmove_falkor (void *dstin, const void *src, size_t count)
   In:  x0 = dstin, x1 = src, x2 = count.   Out: x0 = dstin (unchanged).
   Handles overlapping buffers; falls through to a backward copy when the
   destination lies above the source within count bytes.  */
ENTRY_ALIGN (__memmove_falkor, 6)

	/* tmp1 = dstin - src.  The ccmp only performs the real comparison
	   when count > 96 (hi); otherwise it forces NZCV = 2 (C set) so
	   b.lo is not taken.  Thus we branch to L(move_long) only when
	   count > 96 and 0 <= dstin - src < count (unsigned), i.e. a large
	   copy whose destination overlaps the source from above.  */
	sub	tmp1, dstin, src
	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 96
	ccmp	tmp1, count, 2, hi
	b.lo	L(move_long)

	cmp	count, 16
	b.ls	L(copy16)
	cmp	count, 96
	b.hi	L(copy_long)

	/* Medium copies: 17..96 bytes.  Load first/last 16 bytes up front;
	   tmp1 = count - 1, so bit 6 set means count >= 65 (96-byte path)
	   and bit 5 set means count >= 33 (need the middle 32 bytes).  */
	sub	tmp1, count, 1
	ldp	A_l, A_h, [src]
	tbnz	tmp1, 6, L(copy96)
	ldp	D_l, D_h, [srcend, -16]
	tbz	tmp1, 5, 1f
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [srcend, -32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstend, -32]
1:
	stp	A_l, A_h, [dstin]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Small copies: 0..16 bytes.  Each size class copies from both ends
	   with possibly-overlapping accesses, so no per-byte loop.  */
L(copy16):
	cmp	count, 8
	b.lo	1f
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret
	.p2align 4
1:
	/* 4-7 */
	tbz	count, 2, 1f
	ldr	A_lw, [src]
	ldr	A_hw, [srcend, -4]
	str	A_lw, [dstin]
	str	A_hw, [dstend, -4]
	ret
	.p2align 4
1:
	/* 2-3 */
	tbz	count, 1, 1f
	ldrh	A_lw, [src]
	ldrh	A_hw, [srcend, -2]
	strh	A_lw, [dstin]
	strh	A_hw, [dstend, -2]
	ret
	.p2align 4
1:
	/* 0-1 */
	tbz	count, 0, 1f
	ldrb	A_lw, [src]
	strb	A_lw, [dstin]
1:	ret

	.p2align 4
	/* Copy 64..96 bytes.  Copy 64 bytes from the start and
	   32 bytes from the end.  */
L(copy96):
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [src, 32]
	ldp	D_l, D_h, [src, 48]
	ldp	E_l, E_h, [srcend, -32]
	ldp	F_l, F_h, [srcend, -16]
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin, 32]
	stp	D_l, D_h, [dstin, 48]
	stp	E_l, E_h, [dstend, -32]
	stp	F_l, F_h, [dstend, -16]
	ret

	/* Align SRC to 16 byte alignment so that we don't cross cache line
	   boundaries on both loads and stores.  There are at least 96 bytes
	   to copy, so copy 16 bytes unaligned and then align.  The loop
	   copies 32 bytes per iteration and prefetches one iteration ahead.  */

	.p2align 4
L(copy_long):
	/* Q_l/Q_h are callee-saved x22/x23: park them in the scratch pair
	   B_l/B_h for the duration of the loop (restored in L(last64))
	   rather than spilling to the stack.  */
	sub	count, count, 64 + 16	/* Test and readjust count.  */
	mov	B_l, Q_l
	mov	B_h, Q_h
	ldp	A_l, A_h, [src]
	and	tmp1, src, 15
	bic	src, src, 15
	sub	dst, dstin, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	Q_l, Q_h, [src, 16]!
	stp	A_l, A_h, [dstin]
	ldp	A_l, A_h, [src, 16]!

L(loop64):
	subs	count, count, 32
	stp	Q_l, Q_h, [dst, 16]
	ldp	Q_l, Q_h, [src, 16]!
	stp	A_l, A_h, [dst, 32]!
	ldp	A_l, A_h, [src, 16]!
	b.hi	L(loop64)

	/* Write the last full set of 32 bytes.  The remainder is at most 32
	   bytes, so it is safe to always copy 32 bytes from the end even if
	   there is just 1 byte left.  */
L(last64):
	ldp	C_l, C_h, [srcend, -32]
	stp	Q_l, Q_h, [dst, 16]
	ldp	Q_l, Q_h, [srcend, -16]
	stp	A_l, A_h, [dst, 32]
	stp	C_l, C_h, [dstend, -32]
	stp	Q_l, Q_h, [dstend, -16]
	mov	Q_l, B_l		/* Restore callee-saved x22/x23.  */
	mov	Q_h, B_h
	ret

	.p2align 4
L(move_long):
	cbz	tmp1, 3f		/* dst == src: nothing to copy.  */

	mov	B_l, Q_l		/* Save callee-saved x22/x23.  */
	mov	B_h, Q_h

	/* Align SRCEND to 16 byte alignment so that we don't cross cache line
	   boundaries on both loads and stores.  There are at least 96 bytes
	   to copy, so copy 16 bytes unaligned and then align.  The loop
	   copies 32 bytes per iteration and prefetches one iteration ahead.  */

	ldp	A_l, A_h, [srcend, -16]
	and	tmp1, srcend, 15
	sub	srcend, srcend, tmp1
	ldp	Q_l, Q_h, [srcend, -16]!
	stp	A_l, A_h, [dstend, -16]
	sub	count, count, tmp1
	ldp	A_l, A_h, [srcend, -16]!
	sub	dstend, dstend, tmp1
	sub	count, count, 64

1:
	subs	count, count, 32
	stp	Q_l, Q_h, [dstend, -16]
	ldp	Q_l, Q_h, [srcend, -16]!
	stp	A_l, A_h, [dstend, -32]!
	ldp	A_l, A_h, [srcend, -16]!
	b.hi	1b

	/* Write the last full set of 32 bytes.  The remainder is at most 32
	   bytes, so it is safe to always copy 32 bytes from the start even if
	   there is just 1 byte left.  */
2:
	ldp	C_l, C_h, [src, 16]
	stp	Q_l, Q_h, [dstend, -16]
	ldp	Q_l, Q_h, [src]
	stp	A_l, A_h, [dstend, -32]
	stp	C_l, C_h, [dstin, 16]
	stp	Q_l, Q_h, [dstin]
	mov	Q_l, B_l		/* Restore callee-saved x22/x23.  */
	mov	Q_h, B_h
3:	ret

END (__memmove_falkor)
libc_hidden_builtin_def (__memmove_falkor)