/* Copy SIZE bytes from SRC to DEST.  For SUN4V Niagara.
   Copyright (C) 2006, 2008 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by David S. Miller (davem@davemloft.net)

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, write to the Free
   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307 USA.  */

#include <sysdep.h>

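/* sun4v address space identifiers: ASI_P is the normal, cached primary
   address space, ASI_PNF is the primary non-faulting variant, and
   ASI_BLK_INIT_QUAD_LDD_P selects the Niagara block-init/quad-load
   behaviour used for the 16-byte twin loads and the cache-line
   initializing stores below.  */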
#define ASI_BLK_INIT_QUAD_LDD_P 0xe2
#define ASI_P                   0x80
#define ASI_PNF                 0x82

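/* LOAD issues a single load from the primary address space.  LOAD_TWIN
   is a 16-byte load that fills the even/odd register pair starting at
   dest0; dest1 only documents the implicit second register.  */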
#define LOAD(type,addr,dest)    type##a [addr] ASI_P, dest
#define LOAD_TWIN(addr_reg,dest0,dest1) \
        ldda [addr_reg] ASI_BLK_INIT_QUAD_LDD_P, dest0

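/* STORE_INIT stores through %asi, which the block copy loops set to
   ASI_BLK_INIT_QUAD_LDD_P so that a full cache line's worth of stores
   can allocate the L2 line without first reading it from memory.  */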
#define STORE(type,src,addr)    type src, [addr]
#define STORE_INIT(src,addr)    stxa src, [addr] %asi

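/* A 32-bit wrapper may predefine XCC (as icc) before including this
   file; otherwise assume the 64-bit ABI, compare on the full %xcc
   condition codes and skip the length zero-extension below.  */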
#ifndef XCC
#define USE_BPR
#define XCC xcc
#endif

#if !defined NOT_IN_libc

        .register       %g2,#scratch
        .register       %g3,#scratch
        .register       %g6,#scratch

        .text

        .align          32
ENTRY(__memcpy_niagara1)
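        /* For 32-bit ABIs the length arrives as a 32-bit value; clear
           the upper half of %o2 before using it in 64-bit compares.  */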
# ifndef USE_BPR
        srl             %o2, 0, %o2
# endif
100:    /* %o0=dst, %o1=src, %o2=len */
        mov             %o0, %g5
        cmp             %o2, 0
        be,pn           %XCC, 85f
218:     or             %o0, %o1, %o3
        cmp             %o2, 16
        blu,a,pn        %XCC, 80f
         or             %o3, %o2, %o3

        /* 2 blocks (128 bytes) is the minimum we can do the block
         * copy with.  We need to ensure that we'll iterate at least
         * once in the block copy loop.  At worst we'll need to align
         * the destination to a 64-byte boundary which can chew up
         * to (64 - 1) bytes from the length before we perform the
         * block copy loop.
         */
        cmp             %o2, (2 * 64)
        blu,pt          %XCC, 70f
         andcc          %o3, 0x7, %g0

        /* %o0: dst
         * %o1: src
         * %o2: len  (known to be >= 128)
         *
         * The block copy loops will use %o4/%o5,%g2/%g3 as
         * temporaries while copying the data.
         */

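        /* Prime the first source cache line and point %asi at the
           block-init ASI so the STORE_INIT macro can be used.  */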
        LOAD(prefetch, %o1, #one_read)
        wr              %g0, ASI_BLK_INIT_QUAD_LDD_P, %asi

        /* Align destination on 64-byte boundary.  */
        andcc           %o0, (64 - 1), %o4
        be,pt           %XCC, 2f
         sub            %o4, 64, %o4
        sub             %g0, %o4, %o4   ! bytes to align dst
        sub             %o2, %o4, %o2
1:      subcc           %o4, 1, %o4
        LOAD(ldub, %o1, %g1)
        STORE(stb, %g1, %o0)
        add             %o1, 1, %o1
        bne,pt          %XCC, 1b
         add            %o0, 1, %o0

        /* If the source is on a 16-byte boundary we can do
         * the direct block copy loop.  If it is 8-byte aligned
         * we can do the 16-byte loads offset by -8 bytes and the
         * init stores offset by one register.
         *
         * If the source is not even 8-byte aligned, we need to do
         * shifting and masking (basically integer faligndata).
         *
         * The careful bit with init stores is that if we store
         * to any part of the cache line we have to store the whole
         * cacheline else we can end up with corrupt L2 cache line
         * contents.  Since the loop works on 64-bytes of 64-byte
         * aligned store data at a time, this is easy to ensure.
         */
2:
        andcc           %o1, (16 - 1), %o4
        andn            %o2, (64 - 1), %g1      ! block copy loop iterator
        sub             %o2, %g1, %o2           ! final sub-block copy bytes
        be,pt           %XCC, 50f
         cmp            %o4, 8
        be,a,pt         %XCC, 10f
         sub            %o1, 0x8, %o1

        /* Neither 8-byte nor 16-byte aligned, shift and mask.  */
        mov             %g1, %o4
        and             %o1, 0x7, %g1
        sll             %g1, 3, %g1
        mov             64, %o3
        andn            %o1, 0x7, %o1
        LOAD(ldx, %o1, %g2)
        sub             %o3, %g1, %o3
        sllx            %g2, %g1, %g2

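        /* Integer faligndata setup: %g1 is the left shift ((src & 7) * 8),
         * %o3 is the complementary right shift (64 - %g1), %o4 is the
         * block loop byte count and %g2 holds the leftover high part of
         * the previous source dword, already shifted into position.
         * SWIVEL_ONE_DWORD loads the next aligned dword, merges it with
         * that carry, emits one init store and saves the new remainder.
         */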
#define SWIVEL_ONE_DWORD(SRC, TMP1, TMP2, PRE_VAL, PRE_SHIFT, POST_SHIFT, DST)\
        LOAD(ldx, SRC, TMP1); \
        srlx            TMP1, PRE_SHIFT, TMP2; \
        or              TMP2, PRE_VAL, TMP2; \
        STORE_INIT(TMP2, DST); \
        sllx            TMP1, POST_SHIFT, PRE_VAL;

1:      add             %o1, 0x8, %o1
        SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x00)
        add             %o1, 0x8, %o1
        SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x08)
        add             %o1, 0x8, %o1
        SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x10)
        add             %o1, 0x8, %o1
        SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x18)
        add             %o1, 32, %o1
        LOAD(prefetch, %o1, #one_read)
        sub             %o1, 32 - 8, %o1
        SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x20)
        add             %o1, 8, %o1
        SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x28)
        add             %o1, 8, %o1
        SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x30)
        add             %o1, 8, %o1
        SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x38)
        subcc           %o4, 64, %o4
        bne,pt          %XCC, 1b
         add            %o0, 64, %o0

#undef SWIVEL_ONE_DWORD

        srl             %g1, 3, %g1
        ba,pt           %XCC, 60f
         add            %o1, %g1, %o1

10:     /* Destination is 64-byte aligned, source was only 8-byte
         * aligned but it has been subtracted by 8 and we perform
         * one twin load ahead, then add 8 back into source when
         * we finish the loop.
         */
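        /* Because of the -8 bias, the second register of each twin load
         * holds the first dword we actually want, so the init stores run
         * one register behind the loads.
         */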
        LOAD_TWIN(%o1, %o4, %o5)
1:      add             %o1, 16, %o1
        LOAD_TWIN(%o1, %g2, %g3)
        add             %o1, 16 + 32, %o1
        LOAD(prefetch, %o1, #one_read)
        sub             %o1, 32, %o1
        STORE_INIT(%o5, %o0 + 0x00)     ! initializes cache line
        STORE_INIT(%g2, %o0 + 0x08)
        LOAD_TWIN(%o1, %o4, %o5)
        add             %o1, 16, %o1
        STORE_INIT(%g3, %o0 + 0x10)
        STORE_INIT(%o4, %o0 + 0x18)
        LOAD_TWIN(%o1, %g2, %g3)
        add             %o1, 16, %o1
        STORE_INIT(%o5, %o0 + 0x20)
        STORE_INIT(%g2, %o0 + 0x28)
        LOAD_TWIN(%o1, %o4, %o5)
        STORE_INIT(%g3, %o0 + 0x30)
        STORE_INIT(%o4, %o0 + 0x38)
        subcc           %g1, 64, %g1
        bne,pt          %XCC, 1b
         add            %o0, 64, %o0

        ba,pt           %XCC, 60f
         add            %o1, 0x8, %o1

50:     /* Destination is 64-byte aligned, and source is 16-byte
         * aligned.
         */
1:      LOAD_TWIN(%o1, %o4, %o5)
        add             %o1, 16, %o1
        LOAD_TWIN(%o1, %g2, %g3)
        add             %o1, 16 + 32, %o1
        LOAD(prefetch, %o1, #one_read)
        sub             %o1, 32, %o1
        STORE_INIT(%o4, %o0 + 0x00)     ! initializes cache line
        STORE_INIT(%o5, %o0 + 0x08)
        LOAD_TWIN(%o1, %o4, %o5)
        add             %o1, 16, %o1
        STORE_INIT(%g2, %o0 + 0x10)
        STORE_INIT(%g3, %o0 + 0x18)
        LOAD_TWIN(%o1, %g2, %g3)
        add             %o1, 16, %o1
        STORE_INIT(%o4, %o0 + 0x20)
        STORE_INIT(%o5, %o0 + 0x28)
        STORE_INIT(%g2, %o0 + 0x30)
        STORE_INIT(%g3, %o0 + 0x38)
        subcc           %g1, 64, %g1
        bne,pt          %XCC, 1b
         add            %o0, 64, %o0
        /* fall through */

60:
        /* %o2 contains any final bytes still needed to be copied
         * over.  If anything is left, we copy it one byte at a time.
         */
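        /* %o3 below is dst - src, letting the byte loop at 90f index
         * both buffers while advancing only %o1.
         */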
        wr              %g0, ASI_PNF, %asi
        brz,pt          %o2, 85f
         sub            %o0, %o1, %o3
        ba,a,pt         %XCC, 90f

        .align          64
70:     /* 16 <= len < (2 * 64) */
        bne,pn          %XCC, 75f
         sub            %o0, %o1, %o3

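        /* src and dst are both 8-byte aligned here; %o3 = dst - src.
         * Copy 16 bytes per iteration, then mop up an 8-byte and a
         * 4-byte tail before falling back to the byte loop at 90f.
         */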
72:
        andn            %o2, 0xf, %o4
        and             %o2, 0xf, %o2
1:      subcc           %o4, 0x10, %o4
        LOAD(ldx, %o1, %o5)
        add             %o1, 0x08, %o1
        LOAD(ldx, %o1, %g1)
        sub             %o1, 0x08, %o1
        STORE(stx, %o5, %o1 + %o3)
        add             %o1, 0x8, %o1
        STORE(stx, %g1, %o1 + %o3)
        bgu,pt          %XCC, 1b
         add            %o1, 0x8, %o1
73:     andcc           %o2, 0x8, %g0
        be,pt           %XCC, 1f
         nop
        sub             %o2, 0x8, %o2
        LOAD(ldx, %o1, %o5)
        STORE(stx, %o5, %o1 + %o3)
        add             %o1, 0x8, %o1
1:      andcc           %o2, 0x4, %g0
        be,pt           %XCC, 1f
         nop
        sub             %o2, 0x4, %o2
        LOAD(lduw, %o1, %o5)
        STORE(stw, %o5, %o1 + %o3)
        add             %o1, 0x4, %o1
1:      cmp             %o2, 0
        be,pt           %XCC, 85f
         nop
        ba,pt           %XCC, 90f
         nop

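        /* src and dst are not both 8-byte aligned.  Byte-copy until dst
         * is 8-byte aligned, then either rejoin the aligned loops above
         * or, if src is still misaligned, shift and merge dwords at 8f.
         */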
75:
        andcc           %o0, 0x7, %g1
        sub             %g1, 0x8, %g1
        be,pn           %icc, 2f
         sub            %g0, %g1, %g1
        sub             %o2, %g1, %o2

1:      subcc           %g1, 1, %g1
        LOAD(ldub, %o1, %o5)
        STORE(stb, %o5, %o1 + %o3)
        bgu,pt          %icc, 1b
         add            %o1, 1, %o1

2:      add             %o1, %o3, %o0
        andcc           %o1, 0x7, %g1
        bne,pt          %icc, 8f
         sll            %g1, 3, %g1

        cmp             %o2, 16
        bgeu,pt         %icc, 72b
         nop
        ba,a,pt         %XCC, 73b

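        /* dst is 8-byte aligned, src is not: %g1 = (src & 7) * 8.  Read
         * aligned dwords from src and merge adjacent halves with shifts,
         * the same trick as the swivel loop above.
         */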
8:      mov             64, %o3
        andn            %o1, 0x7, %o1
        LOAD(ldx, %o1, %g2)
        sub             %o3, %g1, %o3
        andn            %o2, 0x7, %o4
        sllx            %g2, %g1, %g2
1:      add             %o1, 0x8, %o1
        LOAD(ldx, %o1, %g3)
        subcc           %o4, 0x8, %o4
        srlx            %g3, %o3, %o5
        or              %o5, %g2, %o5
        STORE(stx, %o5, %o0)
        add             %o0, 0x8, %o0
        bgu,pt          %icc, 1b
         sllx           %g3, %g1, %g2

        srl             %g1, 3, %g1
        andcc           %o2, 0x7, %o2
        be,pn           %icc, 85f
         add            %o1, %g1, %o1
        ba,pt           %XCC, 90f
         sub            %o0, %o1, %o3

        .align          64
80:     /* 0 < len < 16 */
        andcc           %o3, 0x3, %g0
        bne,pn          %XCC, 90f
         sub            %o0, %o1, %o3

1:
        subcc           %o2, 4, %o2
        LOAD(lduw, %o1, %g1)
        STORE(stw, %g1, %o1 + %o3)
        bgu,pt          %XCC, 1b
         add            %o1, 4, %o1

85:     retl
         mov            %g5, %o0

        .align          32
90:
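        /* Copy the remaining bytes one at a time; %o3 = dst - src.  */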
        subcc           %o2, 1, %o2
        LOAD(ldub, %o1, %g1)
        STORE(stb, %g1, %o1 + %o3)
        bgu,pt          %XCC, 90b
         add            %o1, 1, %o1
        retl
         mov            %g5, %o0

END(__memcpy_niagara1)

#endif