/* Copyright (C) 2017-2019 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* Assumptions: ARMv8-a, AArch64, falkor, unaligned accesses.  */

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_x	x6
#define B_x	x7
#define A_w	w6
#define B_w	w7
#define tmp1	x14

#define Q_q	q6
#define A_q	q22
#define B_q	q18
#define C_q	q19
#define D_q	q20
#define E_q	q21
#define F_q	q17
#define G_q	q23

/* RATIONALE:

   The move has 4 distinct parts:
   * Small moves of 16 bytes and under
   * Medium sized moves of 17-96 bytes
   * Large moves where the source address is higher than the destination
     (forward copies)
   * Large moves where the destination address is higher than the source
     (copy backward, or move).

   We use only two registers q6 and q22 for the moves and move 32 bytes at a
   time to correctly train the hardware prefetcher for better throughput.  */
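
/* A rough C-level sketch of the dispatch below (illustrative only; the
   names mirror the register aliases defined above):

     if (count > 96 && (uintptr_t) dstin - (uintptr_t) src < count)
       goto move_long;		// overlapping tail, copy backward
     if (count <= 16)
       goto copy16;		// small copy
     if (count > 96)
       goto copy_long;		// large forward copy
     // otherwise: medium copy of 17..96 bytes
 */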
ENTRY_ALIGN (__memmove_falkor, 6)

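	/* The backward-copy path is taken only when count > 96 and
	   dstin - src is (unsigned) smaller than count, i.e. the
	   destination starts inside the source buffer.  Copies of 96
	   bytes or less are handled below regardless of overlap, since
	   they issue every load before any store.  */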
	sub	tmp1, dstin, src
	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 96
	ccmp	tmp1, count, 2, hi
	b.lo	L(move_long)

	cmp	count, 16
	b.ls	L(copy16)
	cmp	count, 96
	b.hi	L(copy_long)

	/* Medium copies: 17..96 bytes.  */
	sub	tmp1, count, 1
	ldr	A_q, [src]
	tbnz	tmp1, 6, L(copy96)
	ldr	D_q, [srcend, -16]
	tbz	tmp1, 5, 1f
	ldr	B_q, [src, 16]
	ldr	C_q, [srcend, -32]
	str	B_q, [dstin, 16]
	str	C_q, [dstend, -32]
1:
	str	A_q, [dstin]
	str	D_q, [dstend, -16]
	ret

	.p2align 4
	/* Small copies: 0..16 bytes.  */
L(copy16):
	cmp	count, 8
	b.lo	1f
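	/* 8-16 */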
	ldr	A_x, [src]
	ldr	B_x, [srcend, -8]
	str	A_x, [dstin]
	str	B_x, [dstend, -8]
	ret
	.p2align 4
1:
	/* 4-7 */
	tbz	count, 2, 1f
	ldr	A_w, [src]
	ldr	B_w, [srcend, -4]
	str	A_w, [dstin]
	str	B_w, [dstend, -4]
	ret
	.p2align 4
1:
	/* 2-3 */
	tbz	count, 1, 1f
	ldrh	A_w, [src]
	ldrh	B_w, [srcend, -2]
	strh	A_w, [dstin]
	strh	B_w, [dstend, -2]
	ret
	.p2align 4
1:
	/* 0-1 */
	tbz	count, 0, 1f
	ldrb	A_w, [src]
	strb	A_w, [dstin]
1:	ret

	.p2align 4
	/* Copy 64..96 bytes.  Copy 64 bytes from the start and
	   32 bytes from the end.  */
L(copy96):
	ldr	B_q, [src, 16]
	ldr	C_q, [src, 32]
	ldr	D_q, [src, 48]
	ldr	E_q, [srcend, -32]
	ldr	F_q, [srcend, -16]
	str	A_q, [dstin]
	str	B_q, [dstin, 16]
	str	C_q, [dstin, 32]
	str	D_q, [dstin, 48]
	str	E_q, [dstend, -32]
	str	F_q, [dstend, -16]
	ret

	/* Align SRC to 16 byte alignment so that we don't cross cache line
	   boundaries on both loads and stores.  There are at least 96 bytes
	   to copy, so copy 16 bytes unaligned and then align.  The loop
	   copies 32 bytes per iteration and prefetches one iteration ahead.  */

	.p2align 4
L(copy_long):
	ldr	A_q, [src]
	and	tmp1, src, 15
	bic	src, src, 15
	sub	dst, dstin, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldr	Q_q, [src, 16]!
	str	A_q, [dstin]
	ldr	A_q, [src, 16]!
	subs	count, count, 32 + 64 + 16	/* Test and readjust count.  */
	b.ls	L(last64)

L(loop64):
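	/* Store the 32 bytes loaded on the previous iteration while
	   loading the next 32, so the loads stay one iteration ahead of
	   the stores.  */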
	subs	count, count, 32
	str	Q_q, [dst, 16]
	ldr	Q_q, [src, 16]!
	str	A_q, [dst, 32]!
	ldr	A_q, [src, 16]!
	b.hi	L(loop64)

	/* Write the last full set of 64 bytes.  The remainder is at most 64
	   bytes and at least 33 bytes, so it is safe to always copy 64 bytes
	   from the end.  */
L(last64):
	ldr	C_q, [srcend, -64]
	str	Q_q, [dst, 16]
	ldr	B_q, [srcend, -48]
	str	A_q, [dst, 32]
	ldr	A_q, [srcend, -32]
	ldr	D_q, [srcend, -16]
	str	C_q, [dstend, -64]
	str	B_q, [dstend, -48]
	str	A_q, [dstend, -32]
	str	D_q, [dstend, -16]
	ret

	.p2align 4
L(move_long):
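	/* If the source and destination are the same there is nothing
	   to copy.  */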
	cbz	tmp1, 3f

	/* Align SRCEND to 16 byte alignment so that we don't cross cache line
	   boundaries on both loads and stores.  There are at least 96 bytes
	   to copy, so copy 16 bytes unaligned and then align.  The loop
	   copies 32 bytes per iteration and prefetches one iteration ahead.  */

	ldr	A_q, [srcend, -16]
	and	tmp1, srcend, 15
	sub	srcend, srcend, tmp1
	ldr	Q_q, [srcend, -16]!
	str	A_q, [dstend, -16]
	sub	count, count, tmp1
	ldr	A_q, [srcend, -16]!
	sub	dstend, dstend, tmp1
	subs	count, count, 32 + 64
	b.ls	2f

1:
	subs	count, count, 32
	str	Q_q, [dstend, -16]
	ldr	Q_q, [srcend, -16]!
	str	A_q, [dstend, -32]!
	ldr	A_q, [srcend, -16]!
	b.hi	1b

	/* Write the last full set of 64 bytes.  The remainder is at most 64
	   bytes and at least 33 bytes, so it is safe to always copy 64 bytes
	   from the start.  */
2:
	ldr	C_q, [src, 48]
	str	Q_q, [dstend, -16]
	ldr	B_q, [src, 32]
	str	A_q, [dstend, -32]
	ldr	A_q, [src, 16]
	ldr	D_q, [src]
	str	C_q, [dstin, 48]
	str	B_q, [dstin, 32]
	str	A_q, [dstin, 16]
	str	D_q, [dstin]
3:	ret

END (__memmove_falkor)
libc_hidden_builtin_def (__memmove_falkor)