]> git.ipfire.org Git - thirdparty/glibc.git/blob - sysdeps/sparc/sparc64/multiarch/memcpy-niagara1.S
a1c558840a7127a423c1cdff1d393d1b577d537f
[thirdparty/glibc.git] / sysdeps / sparc / sparc64 / multiarch / memcpy-niagara1.S
1 /* Copy SIZE bytes from SRC to DEST. For SUN4V Niagara.
2 Copyright (C) 2006-2014 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by David S. Miller (davem@davemloft.net)
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
19
20 #include <sysdep.h>
21
/* Address Space Identifiers (SUN4V/Niagara).  Per the standard SPARC V9
   ASI encodings: 0x80 = primary, 0x82 = primary non-faulting, and 0xe2
   = the Niagara block-init quad-load/store primary ASI, whose stores
   initialize a whole L2 cache line without reading it from memory.  */
22 #define ASI_BLK_INIT_QUAD_LDD_P 0xe2
23 #define ASI_P 0x80
24 #define ASI_PNF 0x82
25
/* LOAD: normal load of the given type from the primary address space.  */
26 #define LOAD(type,addr,dest) type##a [addr] ASI_P, dest
/* LOAD_TWIN: 16-byte load into the register pair starting at dest0.
   Only dest0 appears in the expansion; per ldda pair semantics the
   second register is implied, so dest1 is documentation only.
   addr_reg must be 16-byte aligned.  */
27 #define LOAD_TWIN(addr_reg,dest0,dest1) \
28 ldda [addr_reg] ASI_BLK_INIT_QUAD_LDD_P, dest0
29
/* STORE: plain store.  STORE_INIT: store through the %asi register,
   which the block-copy code sets to ASI_BLK_INIT_QUAD_LDD_P so that
   destination cache lines are initialized rather than fetched.  */
30 #define STORE(type,src,addr) type src, [addr]
31 #define STORE_INIT(src,addr) stxa src, [addr] %asi
32
/* Default to the 64-bit condition codes; when nothing overrode XCC we
   also have branch-on-register available (USE_BPR).  */
33 #ifndef XCC
34 #define USE_BPR
35 #define XCC xcc
36 #endif
37
38 #if IS_IN (libc)
39
/* Declare the application-reserved globals we clobber as scratch so the
   assembler accepts their use under the V9 ABI.
   NOTE(review): %g6 is declared here but does not appear to be
   referenced in this file -- confirm.  */
40 .register %g2,#scratch
41 .register %g3,#scratch
42 .register %g6,#scratch
43
44 .text
45
/* void *__mempcpy_niagara1 (void *dst, const void *src, size_t len)
 * mempcpy returns dst + len instead of dst.  Compute that return value
 * into %g5 in the branch delay slot, then join the shared memcpy body
 * at label 101 (the common epilogue returns %g5).  */
46 ENTRY(__mempcpy_niagara1)
47 ba,pt %XCC, 101f ! join __memcpy_niagara1 past its "mov %o0, %g5"
48 add %o0, %o2, %g5 ! delay slot: %g5 = dst + len (return value)
49 END(__mempcpy_niagara1)
50
51 .align 32
/* void *__memcpy_niagara1 (void *dst, const void *src, size_t len)
 *
 * In:    %o0 = dst, %o1 = src, %o2 = len
 * Out:   %o0 = value saved in %g5 (dst for memcpy; __mempcpy_niagara1
 *        enters at 101 below with %g5 = dst + len already computed).
 * Scratch: %o3-%o5, %g1-%g3, the %asi register, condition codes.
 *
 * Dispatch: len < 16 -> 80 (word/byte copy); 16 <= len < 128 -> 70;
 * len >= 128 -> align dst to 64 bytes, then a block-init store loop.
 */
52 ENTRY(__memcpy_niagara1)
53 100: /* %o0=dst, %o1=src, %o2=len */
54 mov %o0, %g5 ! memcpy returns dst: save it for the epilogue
/* Common entry; __mempcpy_niagara1 branches here with %g5 preset.  */
55 101:
56 # ifndef USE_BPR
57 srl %o2, 0, %o2 ! no USE_BPR: len is 32-bit, clear the upper half
58 # endif
59 cmp %o2, 0
60 be,pn %XCC, 85f ! len == 0: nothing to copy, return
61 218: or %o0, %o1, %o3 ! delay slot: %o3 = dst | src (alignment probe)
62 cmp %o2, 16
63 blu,a,pn %XCC, 80f ! len < 16: tiny copy (",a": slot runs only if taken)
64 or %o3, %o2, %o3 ! fold len into the probe for 80's word check
65
66 /* 2 blocks (128 bytes) is the minimum we can do the block
67 * copy with. We need to ensure that we'll iterate at least
68 * once in the block copy loop. At worst we'll need to align
69 * the destination to a 64-byte boundary which can chew up
70 * to (64 - 1) bytes from the length before we perform the
71 * block copy loop.
72 */
73 cmp %o2, (2 * 64)
74 blu,pt %XCC, 70f ! len < 128: medium-sized copy
75 andcc %o3, 0x7, %g0 ! delay slot: 8-byte-alignment flags for 70
76
77 /* %o0: dst
78 * %o1: src
79 * %o2: len (known to be >= 128)
80 *
81 * The block copy loops will use %o4/%o5,%g2/%g3 as
82 * temporaries while copying the data.
83 */
84
85 LOAD(prefetch, %o1, #one_read)
86 wr %g0, ASI_BLK_INIT_QUAD_LDD_P, %asi ! STORE_INIT goes through %asi
87
88 /* Align destination on 64-byte boundary. */
89 andcc %o0, (64 - 1), %o4
90 be,pt %XCC, 2f
91 sub %o4, 64, %o4
92 sub %g0, %o4, %o4 ! bytes to align dst
93 sub %o2, %o4, %o2
/* Byte-copy until dst reaches the next 64-byte boundary.  */
94 1: subcc %o4, 1, %o4
95 LOAD(ldub, %o1, %g1)
96 STORE(stb, %g1, %o0)
97 add %o1, 1, %o1
98 bne,pt %XCC, 1b
99 add %o0, 1, %o0
100
101 /* If the source is on a 16-byte boundary we can do
102 * the direct block copy loop. If it is 8-byte aligned
103 * we can do the 16-byte loads offset by -8 bytes and the
104 * init stores offset by one register.
105 *
106 * If the source is not even 8-byte aligned, we need to do
107 * shifting and masking (basically integer faligndata).
108 *
109 * The careful bit with init stores is that if we store
110 * to any part of the cache line we have to store the whole
111 * cacheline else we can end up with corrupt L2 cache line
112 * contents. Since the loop works on 64-bytes of 64-byte
113 * aligned store data at a time, this is easy to ensure.
114 */
115 2:
116 andcc %o1, (16 - 1), %o4 ! src offset within 16 bytes
117 andn %o2, (64 - 1), %g1 ! block copy loop iterator
118 sub %o2, %g1, %o2 ! final sub-block copy bytes
119 be,pt %XCC, 50f ! src 16-byte aligned: direct twin-load loop
120 cmp %o4, 8 ! delay slot: is src 8-byte aligned instead?
121 be,a,pt %XCC, 10f ! yes: offset twin-load loop
122 sub %o1, 0x8, %o1 ! (annulled slot) bias src by -8 for 10f
123
124 /* Neither 8-byte nor 16-byte aligned, shift and mask. */
125 mov %g1, %o4 ! %o4 = block byte count
126 and %o1, 0x7, %g1
127 sll %g1, 3, %g1 ! %g1 = left shift in bits = (src & 7) * 8
128 mov 64, %o3
129 andn %o1, 0x7, %o1 ! round src down to 8-byte boundary
130 LOAD(ldx, %o1, %g2)
131 sub %o3, %g1, %o3 ! %o3 = complementary right shift
132 sllx %g2, %g1, %g2 ! prime the carried-over high part
133
/* Emit one 8-byte store: merge the bits carried over from the previous
 * aligned word (PRE_VAL) with the top of the freshly loaded word -- an
 * integer version of faligndata.  */
134 #define SWIVEL_ONE_DWORD(SRC, TMP1, TMP2, PRE_VAL, PRE_SHIFT, POST_SHIFT, DST)\
135 LOAD(ldx, SRC, TMP1); \
136 srlx TMP1, PRE_SHIFT, TMP2; \
137 or TMP2, PRE_VAL, TMP2; \
138 STORE_INIT(TMP2, DST); \
139 sllx TMP1, POST_SHIFT, PRE_VAL;
140
141 1: add %o1, 0x8, %o1
142 SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x00)
143 add %o1, 0x8, %o1
144 SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x08)
145 add %o1, 0x8, %o1
146 SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x10)
147 add %o1, 0x8, %o1
148 SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x18)
149 add %o1, 32, %o1 ! hop ahead to prefetch the next block
150 LOAD(prefetch, %o1, #one_read)
151 sub %o1, 32 - 8, %o1
152 SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x20)
153 add %o1, 8, %o1
154 SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x28)
155 add %o1, 8, %o1
156 SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x30)
157 add %o1, 8, %o1
158 SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x38)
159 subcc %o4, 64, %o4
160 bne,pt %XCC, 1b
161 add %o0, 64, %o0
162
163 #undef SWIVEL_ONE_DWORD
164
165 srl %g1, 3, %g1 ! bits back to bytes...
166 ba,pt %XCC, 60f
167 add %o1, %g1, %o1 ! ...and restore the true unaligned src
168
169 10: /* Destination is 64-byte aligned, source was only 8-byte
170 * aligned but it has been subtracted by 8 and we perform
171 * one twin load ahead, then add 8 back into source when
172 * we finish the loop.
173 */
174 LOAD_TWIN(%o1, %o4, %o5)
175 1: add %o1, 16, %o1
176 LOAD_TWIN(%o1, %g2, %g3)
177 add %o1, 16 + 32, %o1
178 LOAD(prefetch, %o1, #one_read)
179 sub %o1, 32, %o1
180 STORE_INIT(%o5, %o0 + 0x00) ! initializes cache line
181 STORE_INIT(%g2, %o0 + 0x08)
182 LOAD_TWIN(%o1, %o4, %o5)
183 add %o1, 16, %o1
184 STORE_INIT(%g3, %o0 + 0x10)
185 STORE_INIT(%o4, %o0 + 0x18)
186 LOAD_TWIN(%o1, %g2, %g3)
187 add %o1, 16, %o1
188 STORE_INIT(%o5, %o0 + 0x20)
189 STORE_INIT(%g2, %o0 + 0x28)
190 LOAD_TWIN(%o1, %o4, %o5)
191 STORE_INIT(%g3, %o0 + 0x30)
192 STORE_INIT(%o4, %o0 + 0x38)
193 subcc %g1, 64, %g1
194 bne,pt %XCC, 1b
195 add %o0, 64, %o0
196
197 ba,pt %XCC, 60f
198 add %o1, 0x8, %o1 ! delay slot: undo the -8 bias from above
199
200 50: /* Destination is 64-byte aligned, and source is 16-byte
201 * aligned.
202 */
203 1: LOAD_TWIN(%o1, %o4, %o5)
204 add %o1, 16, %o1
205 LOAD_TWIN(%o1, %g2, %g3)
206 add %o1, 16 + 32, %o1
207 LOAD(prefetch, %o1, #one_read)
208 sub %o1, 32, %o1
209 STORE_INIT(%o4, %o0 + 0x00) ! initializes cache line
210 STORE_INIT(%o5, %o0 + 0x08)
211 LOAD_TWIN(%o1, %o4, %o5)
212 add %o1, 16, %o1
213 STORE_INIT(%g2, %o0 + 0x10)
214 STORE_INIT(%g3, %o0 + 0x18)
215 LOAD_TWIN(%o1, %g2, %g3)
216 add %o1, 16, %o1
217 STORE_INIT(%o4, %o0 + 0x20)
218 STORE_INIT(%o5, %o0 + 0x28)
219 STORE_INIT(%g2, %o0 + 0x30)
220 STORE_INIT(%g3, %o0 + 0x38)
221 subcc %g1, 64, %g1
222 bne,pt %XCC, 1b
223 add %o0, 64, %o0
224 /* fall through */
225
226 60:
227 /* %o2 contains any final bytes still needed to be copied
228 * over. If anything is left, we copy it one byte at a time.
229 */
230 wr %g0, ASI_PNF, %asi ! NOTE(review): presumably restores the expected default %asi -- confirm
231 brz,pt %o2, 85f ! no tail bytes: return
232 sub %o0, %o1, %o3 ! delay slot: %o3 = dst - src for the byte loop
233 ba,a,pt %XCC, 90f
234
235 .align 64
236 70: /* 16 < len <= 64 */
/* NOTE(review): this path is entered for 16 <= len < 128, so the range
 * in the comment above looks stale -- confirm.  */
237 bne,pn %XCC, 75f ! flags from "andcc %o3, 0x7": dst|src unaligned
238 sub %o0, %o1, %o3 ! delay slot: %o3 = dst - src
239
240 72:
/* dst and src mutually 8-byte aligned: move 16 bytes per iteration;
 * stores address dst as %o1 + %o3.  */
241 andn %o2, 0xf, %o4 ! %o4 = bytes handled in 16-byte chunks
242 and %o2, 0xf, %o2 ! %o2 = leftover 0-15 bytes
243 1: subcc %o4, 0x10, %o4
244 LOAD(ldx, %o1, %o5)
245 add %o1, 0x08, %o1
246 LOAD(ldx, %o1, %g1)
247 sub %o1, 0x08, %o1
248 STORE(stx, %o5, %o1 + %o3)
249 add %o1, 0x8, %o1
250 STORE(stx, %g1, %o1 + %o3)
251 bgu,pt %XCC, 1b
252 add %o1, 0x8, %o1
/* Tail: at most one 8-byte word, one 4-byte word, then odd bytes.  */
253 73: andcc %o2, 0x8, %g0
254 be,pt %XCC, 1f
255 nop
256 sub %o2, 0x8, %o2
257 LOAD(ldx, %o1, %o5)
258 STORE(stx, %o5, %o1 + %o3)
259 add %o1, 0x8, %o1
260 1: andcc %o2, 0x4, %g0
261 be,pt %XCC, 1f
262 nop
263 sub %o2, 0x4, %o2
264 LOAD(lduw, %o1, %o5)
265 STORE(stw, %o5, %o1 + %o3)
266 add %o1, 0x4, %o1
267 1: cmp %o2, 0
268 be,pt %XCC, 85f ! nothing left: return
269 nop
270 ba,pt %XCC, 90f ! 1-3 odd bytes: byte loop
271 nop
272
273 75:
/* dst and/or src not 8-byte aligned: byte-copy until dst is aligned,
 * then rejoin 72/73 if src became aligned too, otherwise fall into the
 * shift-and-mask loop at 8f below.  */
274 andcc %o0, 0x7, %g1
275 sub %g1, 0x8, %g1
276 be,pn %icc, 2f ! dst already 8-byte aligned
277 sub %g0, %g1, %g1 ! delay slot: %g1 = 8 - (dst & 7) bytes to copy
278 sub %o2, %g1, %o2
279
280 1: subcc %g1, 1, %g1
281 LOAD(ldub, %o1, %o5)
282 STORE(stb, %o5, %o1 + %o3)
283 bgu,pt %icc, 1b
284 add %o1, 1, %o1
285
286 2: add %o1, %o3, %o0 ! recompute dst = src + (dst - src)
287 andcc %o1, 0x7, %g1
288 bne,pt %icc, 8f ! src still unaligned: shift-and-mask copy
289 sll %g1, 3, %g1 ! delay slot: %g1 = (src & 7) * 8 bit shift
290
291 cmp %o2, 16
292 bgeu,pt %icc, 72b ! both aligned now and >= 16 left: bulk loop
293 nop
294 ba,a,pt %XCC, 73b ! < 16 left: straight to the tail
295
/* Sub-block version of the SWIVEL loop: load aligned 8-byte words and
 * merge adjacent halves to reconstruct the unaligned source stream.  */
296 8: mov 64, %o3
297 andn %o1, 0x7, %o1 ! round src down to 8-byte boundary
298 LOAD(ldx, %o1, %g2)
299 sub %o3, %g1, %o3 ! %o3 = complementary right shift (bits)
300 andn %o2, 0x7, %o4 ! %o4 = whole 8-byte words to produce
301 sllx %g2, %g1, %g2
302 1: add %o1, 0x8, %o1
303 LOAD(ldx, %o1, %g3)
304 subcc %o4, 0x8, %o4
305 srlx %g3, %o3, %o5
306 or %o5, %g2, %o5
307 STORE(stx, %o5, %o0)
308 add %o0, 0x8, %o0
309 bgu,pt %icc, 1b
310 sllx %g3, %g1, %g2 ! delay slot: carry low bits into next word
311
312 srl %g1, 3, %g1 ! bits back to bytes...
313 andcc %o2, 0x7, %o2 ! leftover 0-7 bytes
314 be,pn %icc, 85f ! none: return
315 add %o1, %g1, %o1 ! delay slot: restore true unaligned src
316 ba,pt %XCC, 90f
317 sub %o0, %o1, %o3 ! delay slot: %o3 = dst - src for byte loop
318
319 .align 64
320 80: /* 0 < len <= 16 */
321 andcc %o3, 0x3, %g0 ! %o3 = dst|src|len: all multiples of 4?
322 bne,pn %XCC, 90f ! no: byte-at-a-time
323 sub %o0, %o1, %o3 ! delay slot: %o3 = dst - src
324
/* Word loop; len is a nonzero multiple of 4 here.  */
325 1:
326 subcc %o2, 4, %o2
327 LOAD(lduw, %o1, %g1)
328 STORE(stw, %g1, %o1 + %o3)
329 bgu,pt %XCC, 1b
330 add %o1, 4, %o1
331
/* Common return: hand back the value stashed in %g5 (dst, or
 * dst + len for mempcpy).  */
332 85: retl
333 mov %g5, %o0
334
335 .align 32
/* Byte-at-a-time copy of the remaining %o2 (> 0) bytes; %o3 holds
 * dst - src.  */
336 90:
337 subcc %o2, 1, %o2
338 LOAD(ldub, %o1, %g1)
339 STORE(stb, %g1, %o1 + %o3)
340 bgu,pt %XCC, 90b
341 add %o1, 1, %o1
342 retl
343 mov %g5, %o0
344
345 END(__memcpy_niagara1)
346
347 #endif