/* Copy SIZE bytes from SRC to DEST.  For SUN4V Niagara-4.
   Copyright (C) 2012-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by David S. Miller (davem@davemloft.net)

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

#define ASI_BLK_INIT_QUAD_LDD_P 0xe2

#define FPRS_FEF                0x04

/* On T4 it is very expensive to access ASRs like %fprs and
 * %asi, avoiding a read or a write can save ~50 cycles.
 */
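/* FPU_ENTER therefore reads %fprs once and performs the enabling
 * write only when FPRS_FEF is not already set (the write sits in an
 * annulled delay slot), while VISExitHalf restores just the FEF
 * state that was saved in %o5 on entry.
 */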
#define FPU_ENTER \
        rd      %fprs, %o5; \
        andcc   %o5, FPRS_FEF, %g0; \
        be,a,pn %icc, 999f; \
         wr     %g0, FPRS_FEF, %fprs; \
        999:

#define VISEntryHalf FPU_ENTER
#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs

#define GLOBAL_SPARE    %g5

#define STORE_ASI       ASI_BLK_INIT_QUAD_LDD_P
#define EX_LD(x)        x
#define EX_ST(x)        x
#define EX_RETVAL(x)    x
#define LOAD(type,addr,dest)    type [addr], dest
#define STORE(type,src,addr)    type src, [addr]
#define STORE_INIT(src,addr)    stxa src, [addr] STORE_ASI
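/* EX_LD/EX_ST/EX_RETVAL expand to their argument unchanged; no fault
 * handling is layered on in this variant.  STORE_INIT issues an
 * initializing store through ASI_BLK_INIT_QUAD_LDD_P, which allocates
 * the destination cache line without reading its old contents first.
 */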

#if IS_IN (libc)

        .register       %g2,#scratch
        .register       %g3,#scratch
        .register       %g6,#scratch

        .text

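/* mempcpy returns dst + len where memcpy returns dst; each entry
 * point preloads its return value into %o3 and shares the copy
 * body at 101.
 */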
ENTRY(__mempcpy_niagara4)
        ba,pt           %icc, 101f
         add            %o0, %o2, %o3
END(__mempcpy_niagara4)

        .align          32
ENTRY(__memcpy_niagara4)
100:    /* %o0=dst, %o1=src, %o2=len */
        mov             %o0, %o3
101:
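        /* Dispatch on the length: <= 3 bytes to .Ltiny, <= 19 to
         * .Lsmall, < 128 to .Lmedium, otherwise the large-copy code.
         * %g2 is set to dst | src so that its low bits describe the
         * combined alignment of both pointers.
         */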
#ifndef __arch64__
        srl             %o2, 0, %o2
#endif
        brz,pn          %o2, .Lexit
         cmp            %o2, 3
        ble,pn          %icc, .Ltiny
         cmp            %o2, 19
        ble,pn          %icc, .Lsmall
         or             %o0, %o1, %g2
        cmp             %o2, 128
        bl,pn           %icc, .Lmedium
         nop

.Llarge:/* len >= 0x80 */
        /* First get dest 8 byte aligned. */
        sub             %g0, %o0, %g1
        and             %g1, 0x7, %g1
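        /* %g1 = -dst & 7, the number of leading bytes to copy before
         * dst reaches an 8-byte boundary.
         */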
        brz,pt          %g1, 51f
         sub            %o2, %g1, %o2

1:      EX_LD(LOAD(ldub, %o1 + 0x00, %g2))
        add             %o1, 1, %o1
        subcc           %g1, 1, %g1
        add             %o0, 1, %o0
        bne,pt          %icc, 1b
         EX_ST(STORE(stb, %g2, %o0 - 0x01))

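        /* Issue strong read prefetches for the next eight 64-byte
         * lines of the source before entering the main loop.
         */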
51:     LOAD(prefetch, %o1 + 0x040, #n_reads_strong)
        LOAD(prefetch, %o1 + 0x080, #n_reads_strong)
        LOAD(prefetch, %o1 + 0x0c0, #n_reads_strong)
        LOAD(prefetch, %o1 + 0x100, #n_reads_strong)
        LOAD(prefetch, %o1 + 0x140, #n_reads_strong)
        LOAD(prefetch, %o1 + 0x180, #n_reads_strong)
        LOAD(prefetch, %o1 + 0x1c0, #n_reads_strong)
        LOAD(prefetch, %o1 + 0x200, #n_reads_strong)

        /* Check if we can use the straight fully aligned
         * loop, or we require the alignaddr/faligndata variant.
         */
        andcc           %o1, 0x7, %o5
        bne,pn          %icc, .Llarge_src_unaligned
         sub            %g0, %o0, %g1

        /* Legitimize the use of initializing stores by getting dest
         * to be 64-byte aligned.
         */
        and             %g1, 0x3f, %g1
        brz,pt          %g1, .Llarge_aligned
         sub            %o2, %g1, %o2

1:      EX_LD(LOAD(ldx, %o1 + 0x00, %g2))
        add             %o1, 8, %o1
        subcc           %g1, 8, %g1
        add             %o0, 8, %o0
        bne,pt          %icc, 1b
         EX_ST(STORE(stx, %g2, %o0 - 0x08))

.Llarge_aligned:
        /* len >= 0x80 && src 8-byte aligned && dest 8-byte aligned */
        andn            %o2, 0x3f, %o4
        sub             %o2, %o4, %o2
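        /* %o4 = len rounded down to a multiple of 64, %o2 = the
         * remaining tail handled after the loop.
         */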

1:      EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
        add             %o1, 0x40, %o1
        EX_LD(LOAD(ldx, %o1 - 0x38, %g2))
        subcc           %o4, 0x40, %o4
        EX_LD(LOAD(ldx, %o1 - 0x30, %g3))
        EX_LD(LOAD(ldx, %o1 - 0x28, GLOBAL_SPARE))
        EX_LD(LOAD(ldx, %o1 - 0x20, %o5))
        EX_ST(STORE_INIT(%g1, %o0))
        add             %o0, 0x08, %o0
        EX_ST(STORE_INIT(%g2, %o0))
        add             %o0, 0x08, %o0
        EX_LD(LOAD(ldx, %o1 - 0x18, %g2))
        EX_ST(STORE_INIT(%g3, %o0))
        add             %o0, 0x08, %o0
        EX_LD(LOAD(ldx, %o1 - 0x10, %g3))
        EX_ST(STORE_INIT(GLOBAL_SPARE, %o0))
        add             %o0, 0x08, %o0
        EX_LD(LOAD(ldx, %o1 - 0x08, GLOBAL_SPARE))
        EX_ST(STORE_INIT(%o5, %o0))
        add             %o0, 0x08, %o0
        EX_ST(STORE_INIT(%g2, %o0))
        add             %o0, 0x08, %o0
        EX_ST(STORE_INIT(%g3, %o0))
        add             %o0, 0x08, %o0
        EX_ST(STORE_INIT(GLOBAL_SPARE, %o0))
        add             %o0, 0x08, %o0
        bne,pt          %icc, 1b
         LOAD(prefetch, %o1 + 0x200, #n_reads_strong)

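        /* Stores through the block-init ASI are not ordered like
         * normal TSO stores, so synchronize them before the tail is
         * copied with ordinary loads and stores.
         */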
        membar          #StoreLoad | #StoreStore

        brz,pn          %o2, .Lexit
         cmp            %o2, 19
        ble,pn          %icc, .Lsmall_unaligned
         nop
        ba,a,pt         %icc, .Lmedium_noprefetch

.Lexit: retl
         mov            EX_RETVAL(%o3), %o0

.Llarge_src_unaligned:
        andn            %o2, 0x3f, %o4
        sub             %o2, %o4, %o2
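        /* Copy 64 bytes per iteration using VIS: alignaddr rounds the
         * source down to an 8-byte boundary (result in %g1) and
         * latches the misalignment in %gsr, then each faligndata
         * extracts one destination doubleword from two consecutive
         * aligned loads.
         */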
        VISEntryHalf
        alignaddr       %o1, %g0, %g1
        add             %o1, %o4, %o1
        EX_LD(LOAD(ldd, %g1 + 0x00, %f0))
1:      EX_LD(LOAD(ldd, %g1 + 0x08, %f2))
        subcc           %o4, 0x40, %o4
        EX_LD(LOAD(ldd, %g1 + 0x10, %f4))
        EX_LD(LOAD(ldd, %g1 + 0x18, %f6))
        EX_LD(LOAD(ldd, %g1 + 0x20, %f8))
        EX_LD(LOAD(ldd, %g1 + 0x28, %f10))
        EX_LD(LOAD(ldd, %g1 + 0x30, %f12))
        EX_LD(LOAD(ldd, %g1 + 0x38, %f14))
        faligndata      %f0, %f2, %f16
        EX_LD(LOAD(ldd, %g1 + 0x40, %f0))
        faligndata      %f2, %f4, %f18
        add             %g1, 0x40, %g1
        faligndata      %f4, %f6, %f20
        faligndata      %f6, %f8, %f22
        faligndata      %f8, %f10, %f24
        faligndata      %f10, %f12, %f26
        faligndata      %f12, %f14, %f28
        faligndata      %f14, %f0, %f30
        EX_ST(STORE(std, %f16, %o0 + 0x00))
        EX_ST(STORE(std, %f18, %o0 + 0x08))
        EX_ST(STORE(std, %f20, %o0 + 0x10))
        EX_ST(STORE(std, %f22, %o0 + 0x18))
        EX_ST(STORE(std, %f24, %o0 + 0x20))
        EX_ST(STORE(std, %f26, %o0 + 0x28))
        EX_ST(STORE(std, %f28, %o0 + 0x30))
        EX_ST(STORE(std, %f30, %o0 + 0x38))
        add             %o0, 0x40, %o0
        bne,pt          %icc, 1b
         LOAD(prefetch, %g1 + 0x200, #n_reads_strong)
        VISExitHalf

        brz,pn          %o2, .Lexit
         cmp            %o2, 19
        ble,pn          %icc, .Lsmall_unaligned
         nop
        ba,a,pt         %icc, .Lmedium_unaligned

.Lmedium:
        LOAD(prefetch, %o1 + 0x40, #n_reads_strong)
        andcc           %g2, 0x7, %g0
        bne,pn          %icc, .Lmedium_unaligned
         nop
.Lmedium_noprefetch:
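        /* Both pointers are 8-byte aligned here: move 32 bytes per
         * iteration, then 8-byte and 4-byte tails, finishing any
         * remainder in .Ltiny.
         */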
        andncc          %o2, 0x20 - 1, %o5
        be,pn           %icc, 2f
         sub            %o2, %o5, %o2
1:      EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
        EX_LD(LOAD(ldx, %o1 + 0x08, %g2))
        EX_LD(LOAD(ldx, %o1 + 0x10, GLOBAL_SPARE))
        EX_LD(LOAD(ldx, %o1 + 0x18, %o4))
        add             %o1, 0x20, %o1
        subcc           %o5, 0x20, %o5
        EX_ST(STORE(stx, %g1, %o0 + 0x00))
        EX_ST(STORE(stx, %g2, %o0 + 0x08))
        EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x10))
        EX_ST(STORE(stx, %o4, %o0 + 0x18))
        bne,pt          %icc, 1b
         add            %o0, 0x20, %o0
2:      andcc           %o2, 0x18, %o5
        be,pt           %icc, 3f
         sub            %o2, %o5, %o2
1:      EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
        add             %o1, 0x08, %o1
        add             %o0, 0x08, %o0
        subcc           %o5, 0x08, %o5
        bne,pt          %icc, 1b
         EX_ST(STORE(stx, %g1, %o0 - 0x08))
3:      brz,pt          %o2, .Lexit
         cmp            %o2, 0x04
        bl,pn           %icc, .Ltiny
         nop
        EX_LD(LOAD(lduw, %o1 + 0x00, %g1))
        add             %o1, 0x04, %o1
        add             %o0, 0x04, %o0
        subcc           %o2, 0x04, %o2
        bne,pn          %icc, .Ltiny
         EX_ST(STORE(stw, %g1, %o0 - 0x04))
        ba,a,pt         %icc, .Lexit
.Lmedium_unaligned:
        /* First get dest 8 byte aligned. */
        sub             %g0, %o0, %g1
        and             %g1, 0x7, %g1
        brz,pt          %g1, 2f
         sub            %o2, %g1, %o2

1:      EX_LD(LOAD(ldub, %o1 + 0x00, %g2))
        add             %o1, 1, %o1
        subcc           %g1, 1, %g1
        add             %o0, 1, %o0
        bne,pt          %icc, 1b
         EX_ST(STORE(stb, %g2, %o0 - 0x01))
2:
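        /* The source may still be misaligned: load aligned
         * doublewords and merge adjacent pairs with shifts.  %g1 is
         * the source offset in bits (offset * 8) and %g2 = 64 - %g1,
         * so each stored word is (prev << %g1) | (next >> %g2).
         */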
        and             %o1, 0x7, %g1
        brz,pn          %g1, .Lmedium_noprefetch
         sll            %g1, 3, %g1
        mov             64, %g2
        sub             %g2, %g1, %g2
        andn            %o1, 0x7, %o1
        EX_LD(LOAD(ldx, %o1 + 0x00, %o4))
        sllx            %o4, %g1, %o4
        andn            %o2, 0x08 - 1, %o5
        sub             %o2, %o5, %o2
1:      EX_LD(LOAD(ldx, %o1 + 0x08, %g3))
        add             %o1, 0x08, %o1
        subcc           %o5, 0x08, %o5
        srlx            %g3, %g2, GLOBAL_SPARE
        or              GLOBAL_SPARE, %o4, GLOBAL_SPARE
        EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x00))
        add             %o0, 0x08, %o0
        bne,pt          %icc, 1b
         sllx           %g3, %g1, %o4
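        /* Convert the bit shift back into a byte offset and step %o1
         * to the first source byte not yet copied.
         */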
        srl             %g1, 3, %g1
        add             %o1, %g1, %o1
        brz,pn          %o2, .Lexit
         nop
        ba,pt           %icc, .Lsmall_unaligned

.Ltiny:
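        /* 1 to 3 bytes remain; copy them one at a time, exiting as
         * soon as the count is exhausted.
         */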
        EX_LD(LOAD(ldub, %o1 + 0x00, %g1))
        subcc           %o2, 1, %o2
        be,pn           %icc, .Lexit
         EX_ST(STORE(stb, %g1, %o0 + 0x00))
        EX_LD(LOAD(ldub, %o1 + 0x01, %g1))
        subcc           %o2, 1, %o2
        be,pn           %icc, .Lexit
         EX_ST(STORE(stb, %g1, %o0 + 0x01))
        EX_LD(LOAD(ldub, %o1 + 0x02, %g1))
        ba,pt           %icc, .Lexit
         EX_ST(STORE(stb, %g1, %o0 + 0x02))

.Lsmall:
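        /* 4 to 19 bytes: if dst | src is 4-byte aligned, copy a word
         * at a time and let .Ltiny mop up the remainder; otherwise
         * fall back to the byte loop.
         */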
        andcc           %g2, 0x3, %g0
        bne,pn          %icc, .Lsmall_unaligned
         andn           %o2, 0x4 - 1, %o5
        sub             %o2, %o5, %o2
1:
        EX_LD(LOAD(lduw, %o1 + 0x00, %g1))
        add             %o1, 0x04, %o1
        subcc           %o5, 0x04, %o5
        add             %o0, 0x04, %o0
        bne,pt          %icc, 1b
         EX_ST(STORE(stw, %g1, %o0 - 0x04))
        brz,pt          %o2, .Lexit
         nop
        ba,a,pt         %icc, .Ltiny

.Lsmall_unaligned:
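        /* Fully unaligned fallback: a plain byte-at-a-time copy. */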
1:      EX_LD(LOAD(ldub, %o1 + 0x00, %g1))
        add             %o1, 1, %o1
        add             %o0, 1, %o0
        subcc           %o2, 1, %o2
        bne,pt          %icc, 1b
         EX_ST(STORE(stb, %g1, %o0 - 0x01))
        ba,a,pt         %icc, .Lexit
END(__memcpy_niagara4)

#endif