/* Optimized memcpy for Qualcomm Falkor processor.
   Copyright (C) 2017-2018 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* Assumptions:

   ARMv8-a, AArch64, falkor, unaligned accesses.  */

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define A_hw	w7
#define tmp1	x14
/* Copies are split into 3 main cases:

   1. Small copies of up to 32 bytes
   2. Medium copies of 33..128 bytes which are fully unrolled
   3. Large copies of more than 128 bytes.

   Large copies align the source to a quad word and use an unrolled loop
   processing 64 bytes per iteration (an illustrative C-level sketch of
   this dispatch follows this comment).

   FALKOR-SPECIFIC DESIGN:

   The smallest copies (32 bytes or less) focus on optimal pipeline usage,
   which is why the redundant copies of 0-3 bytes have been replaced with
   conditionals, since the former would unnecessarily break across multiple
   issue groups.  The medium copy group has been enlarged to 128 bytes since
   bumping the small-copy limit up to 32 bytes allows us to do that without
   cost and also allows us to reduce the size of the prep code before loop64.

   All copies are done only via two registers, x6 and x7 (A_l/A_h).  This
   is to ensure that all loads hit a single hardware prefetcher which can
   get correctly trained to prefetch a single stream.

   The non-temporal stores help optimize cache utilization.  */
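
/* Illustrative C-level sketch of the size dispatch described above.  It is
   not part of the build; copy_small, copy_medium and copy_large are
   hypothetical helpers standing in for the L(copy32), medium and
   L(copy_long) paths below:

     void *__memcpy_falkor_sketch (void *dst, const void *src, size_t n)
     {
       if (n <= 32)
         copy_small (dst, src, n);    // L(copy32): overlapping head/tail
       else if (n <= 128)
         copy_medium (dst, src, n);   // fully unrolled head + tail copies
       else
         copy_large (dst, src, n);    // align src, 64-byte loop, L(last64)
       return dst;
     }  */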

#if IS_IN (libc)
ENTRY_ALIGN (__memcpy_falkor, 6)

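	/* Compute the end pointers and dispatch on size: branch to the small
	   path for at most 32 bytes; otherwise copy the first 16 bytes
	   eagerly, branch to the large path for more than 128 bytes, and
	   fall through for 33..128 bytes.  */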
	cmp	count, 32
	add	srcend, src, count
	add	dstend, dstin, count
	b.ls	L(copy32)
	ldp	A_l, A_h, [src]
	cmp	count, 128
	stp	A_l, A_h, [dstin]
	b.hi	L(copy_long)

	/* Medium copies: 33..128 bytes.  */
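	/* The first 16 bytes were copied at entry.  Copy bytes 16..31 from
	   the start and the last 32 bytes from the end; when bit 6 of
	   count - 1 is set (count > 64), also copy bytes 32..63 from the
	   start and 32 more bytes from the end.  Head and tail overlap as
	   needed, so every size up to 128 is covered.  */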
	sub	tmp1, count, 1
	ldp	A_l, A_h, [src, 16]
	stp	A_l, A_h, [dstin, 16]
	tbz	tmp1, 6, 1f
	ldp	A_l, A_h, [src, 32]
	stp	A_l, A_h, [dstin, 32]
	ldp	A_l, A_h, [src, 48]
	stp	A_l, A_h, [dstin, 48]
	ldp	A_l, A_h, [srcend, -64]
	stp	A_l, A_h, [dstend, -64]
	ldp	A_l, A_h, [srcend, -48]
	stp	A_l, A_h, [dstend, -48]
1:
	ldp	A_l, A_h, [srcend, -32]
	stp	A_l, A_h, [dstend, -32]
	ldp	A_l, A_h, [srcend, -16]
	stp	A_l, A_h, [dstend, -16]
	ret

	.p2align 4
	/* Small copies: 0..32 bytes.  */
L(copy32):
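	/* Each class below copies from the start and (except for the final
	   0-1 case) from the end with accesses of a single width; the two
	   accesses may overlap, so no byte-by-byte loop is needed.  */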
	/* 16-32 */
	cmp	count, 16
	b.lo	1f
	ldp	A_l, A_h, [src]
	stp	A_l, A_h, [dstin]
	ldp	A_l, A_h, [srcend, -16]
	stp	A_l, A_h, [dstend, -16]
	ret
	.p2align 4
1:
	/* 8-15 */
	tbz	count, 3, 1f
	ldr	A_l, [src]
	str	A_l, [dstin]
	ldr	A_l, [srcend, -8]
	str	A_l, [dstend, -8]
	ret
	.p2align 4
1:
	/* 4-7 */
	tbz	count, 2, 1f
	ldr	A_lw, [src]
	str	A_lw, [dstin]
	ldr	A_lw, [srcend, -4]
	str	A_lw, [dstend, -4]
	ret
	.p2align 4
1:
	/* 2-3 */
	tbz	count, 1, 1f
	ldrh	A_lw, [src]
	strh	A_lw, [dstin]
	ldrh	A_lw, [srcend, -2]
	strh	A_lw, [dstend, -2]
	ret
	.p2align 4
1:
	/* 0-1 */
	tbz	count, 0, 1f
	ldrb	A_lw, [src]
	strb	A_lw, [dstin]
1:
	ret

	/* Align SRC to 16 bytes and copy; that way at least one of the
	   accesses is aligned throughout the copy sequence.

	   The count is off by 0 to 15 bytes, but this is OK because we trim
	   off the last 64 bytes and copy them from the end.  Due to this the
	   loop never runs out of bounds.  */
	.p2align 6
L(copy_long):
	sub	count, count, 64 + 16
	and	tmp1, src, 15
	bic	src, src, 15
	sub	dst, dstin, tmp1
	add	count, count, tmp1
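	/* src is now 16-byte aligned and dst carries the same downward bias,
	   so [src, off] and [dst, off] still address corresponding bytes.
	   The loop below starts copying at [src, 16], i.e. 16 - tmp1 bytes
	   into the original buffer; everything before that was covered by
	   the initial 16-byte copy at entry.  */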

L(loop64):
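	/* Copy 64 bytes per iteration: four pre-indexed 16-byte loads through
	   A_l/A_h, four non-temporal store pairs at dst + 16..64, then
	   advance dst by 64.  Using a single register pair keeps all loads
	   in one prefetch stream (see the design notes above).  */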
	ldp	A_l, A_h, [src, 16]!
	stnp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [src, 16]!
	subs	count, count, 64
	stnp	A_l, A_h, [dst, 32]
	ldp	A_l, A_h, [src, 16]!
	stnp	A_l, A_h, [dst, 48]
	ldp	A_l, A_h, [src, 16]!
	stnp	A_l, A_h, [dst, 64]
	add	dst, dst, 64
	b.hi	L(loop64)

	/* Write the last full set of 64 bytes.  The remainder is at most 64
	   bytes, so it is safe to always copy 64 bytes from the end even if
	   there is just 1 byte left.  */
L(last64):
	ldp	A_l, A_h, [srcend, -64]
	stnp	A_l, A_h, [dstend, -64]
	ldp	A_l, A_h, [srcend, -48]
	stnp	A_l, A_h, [dstend, -48]
	ldp	A_l, A_h, [srcend, -32]
	stnp	A_l, A_h, [dstend, -32]
	ldp	A_l, A_h, [srcend, -16]
	stnp	A_l, A_h, [dstend, -16]
	ret

END (__memcpy_falkor)
libc_hidden_builtin_def (__memcpy_falkor)
#endif