/* Copy SIZE bytes from SRC to DEST.  For SUN4V M7.
   Copyright (C) 2017-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

#ifndef XCC
# define XCC xcc
#endif
	.register	%g2,#scratch
	.register	%g3,#scratch
	.register	%g6,#scratch

#define FPRS_FEF	0x04	/* fprs.fef bit: FP enable */

/*
 * ASI_STBI_P marks the cache line as "least recently used"
 * which means if many threads are active, it has a high chance
 * of being pushed out of the cache between the first initializing
 * store and the final stores.
 * Thus, in this algorithm we use ASI_STBIMRU_P which marks the
 * cache line as "most recently used" for all but the last cache
 * line.
 */

#define ASI_BLK_INIT_QUAD_LDD_P	0xe2
#define ASI_ST_BLK_INIT_MRU_P	0xf2

#define ASI_STBI_P	ASI_BLK_INIT_QUAD_LDD_P
#define ASI_STBIMRU_P	ASI_ST_BLK_INIT_MRU_P

#define BLOCK_SIZE	64	/* L2 data cache line size */
#define SHORTCOPY	3
#define SHORTCHECK	14
#define SHORT_LONG	64	/* max copy for short longword-aligned case */
				/* must be at least 64 */
#define SMALL_MAX	255	/* max small copy for word/long aligned */
#define SMALL_UMAX	128	/* max small copy for unaligned case */
#define MED_WMAX	1023	/* max copy for medium word-aligned case */
#define MED_MAX		511	/* max copy for medium longword-aligned case */
#define ST_CHUNK	20	/* ST_CHUNK - block of values for BIS Store */
/* on T4, prefetch 20 is a strong read prefetch to L1 and L2 data cache
 * prefetch 20 can cause inst pipeline to delay if data is in memory
 * prefetch 21 is a strong read prefetch to L2 data cache, not L1 data cache */
#define ALIGN_PRE	20	/* distance for aligned prefetch loop */

#define EX_ST(x)	x
#define EX_RETVAL(x)	x
#define STORE_ASI(src,addr)	stxa src, [addr] ASI_STBIMRU_P
#define STORE_INIT(src,addr)	stxa src, [addr] ASI_STBI_P

#if IS_IN (libc)

	.text
/* void *memmove (void *dst, const void *src, size_t len)
   In:  %o0 = dst, %o1 = src, %o2 = len.  Returns dst in %o0.
   Only the truly-overlapping case (src < dst && dst - src < len) is
   copied backwards here; every other case falls through to .Lforcpy,
   the forward path shared with __memcpy_niagara7 below.  */
ENTRY(__memmove_niagara7)
	/* %o0=dst, %o1=src, %o2=len */
	cmp	%o1, %o0	/* if from address is >= to use forward copy */
	bgeu,pn	%XCC, .Lforcpy	/* else use backward if ... */
	sub	%o0, %o1, %o4	/* get difference of two addresses */
	cmp	%o2, %o4	/* compare size and difference of addresses */
	bleu,pn	%XCC, .Lforcpy	/* if size is bigger, do overlapped copy */
	add	%o1, %o2, %o5	/* get to end of source space */

/* an overlapped copy that must be done "backwards" */
.Lchksize:
	cmp	%o2, 8		/* less than 8 byte do byte copy */
	blu,pn	%XCC, 2f	/* else continue */

/* Now size is bigger than 8 */
.Ldbalign:
	add	%o0, %o2, %g1	/* get to end of dest space */
	andcc	%g1, 7, %o3	/* %o3 has cnt til dst 8 byte align */
	bz,a,pn	%XCC, .Ldbbck	/* skip if dst is 8 byte aligned */
	andn	%o2, 7, %o3	/* force %o3 cnt to multiple of 8 */
	sub	%o2, %o3, %o2	/* update o2 with new count */

1:	dec	%o5		/* decrement source */
	ldub	[%o5], %g1	/* load one byte */
	deccc	%o3		/* decrement count */
	bgu,pt	%XCC, 1b	/* if not done keep copying */
	stb	%g1, [%o5+%o4]	/* store one byte into dest */
	andncc	%o2, 7, %o3	/* force %o3 cnt to multiple of 8 */
	bz,pn	%XCC, 2f	/* if size < 8, move to byte copy */

/* Now Destination is 8 byte aligned */
.Ldbbck:
	andcc	%o5, 7, %o0	/* %o0 has src offset */
	bz,a,pn	%XCC, .Ldbcopybc /* if src is aligned do fast memmove */
	sub	%o2, %o3, %o2	/* Residue bytes in %o2 */

.Lcpy_dbwdbc:			/* alignment of src is needed */
	sub	%o2, 8, %o2	/* set size one loop ahead */
	sll	%o0, 3, %g1	/* %g1 is left shift */
	mov	64, %g5		/* init %g5 to be 64 */
	sub	%g5, %g1, %g5	/* %g5 rightshift = (64 - leftshift) */
	sub	%o5, %o0, %o5	/* align the src at 8 bytes. */
	add	%o4, %o0, %o4	/* increase diff between src & dst */
	ldx	[%o5], %o1	/* load first 8 bytes */
	srlx	%o1, %g5, %o1
1:	sub	%o5, 8, %o5	/* subtract 8 from src */
	ldx	[%o5], %o0	/* load 8 byte */
	sllx	%o0, %g1, %o3	/* shift loaded val left to tmp reg */
	or	%o1, %o3, %o3	/* align data */
	stx	%o3, [%o5+%o4]	/* store 8 byte */
	subcc	%o2, 8, %o2	/* subtract 8 byte from size */
	bg,pt	%XCC, 1b	/* if size > 0 continue */
	srlx	%o0, %g5, %o1	/* move extra byte for the next use */

	srl	%g1, 3, %o0	/* restore %o0 value for alignment */
	add	%o5, %o0, %o5	/* restore src alignment */
	sub	%o4, %o0, %o4	/* restore diff between src & dest */

	ba	2f		/* branch to the trailing byte copy */
	add	%o2, 8, %o2	/* restore size value */

.Ldbcopybc:			/* alignment of src is not needed */
1:	sub	%o5, 8, %o5	/* subtract from src */
	ldx	[%o5], %g1	/* load 8 bytes */
	subcc	%o3, 8, %o3	/* subtract from size */
	bgu,pt	%XCC, 1b	/* if size is bigger 0 continue */
	stx	%g1, [%o5+%o4]	/* store 8 bytes to destination */

	ba	2f
	nop

.Lbcbyte:
1:	ldub	[%o5], %g1	/* load one byte */
	stb	%g1, [%o5+%o4]	/* store one byte */
2:	deccc	%o2		/* decrement size */
	bgeu,a,pt %XCC, 1b	/* if size is >= 0 continue */
	dec	%o5		/* decrement from address */

.Lexitbc:			/* exit from backward copy */
	retl
	add	%o5, %o4, %o0	/* restore dest addr */


/* Check to see if memmove is large aligned copy
 * If so, use special version of copy that avoids
 * use of block store init. */
.Lforcpy:
	cmp	%o2, SMALL_MAX	/* check for not small case */
	blt,pn	%XCC, .Lmv_short /* merge with memcpy */
	mov	%o0, %g1	/* save %o0 */
	neg	%o0, %o5
	andcc	%o5, 7, %o5	/* bytes till DST 8 byte aligned */
	brz,pt	%o5, .Lmv_dst_aligned_on_8

/* %o5 has the bytes to be written in partial store. */
	sub	%o2, %o5, %o2
	sub	%o1, %o0, %o1	/* %o1 gets the difference */
7:				/* dst aligning loop */
	ldub	[%o1+%o0], %o4	/* load one byte */
	subcc	%o5, 1, %o5
	stb	%o4, [%o0]
	bgu,pt	%XCC, 7b
	add	%o0, 1, %o0	/* advance dst */
	add	%o1, %o0, %o1	/* restore %o1 */
.Lmv_dst_aligned_on_8:
	andcc	%o1, 7, %o5
	brnz,pn	%o5, .Lsrc_dst_unaligned_on_8
	prefetch [%o1 + (1 * BLOCK_SIZE)], 20

.Lmv_src_dst_aligned_on_8:
/* check if we are copying MED_MAX or more bytes */
	cmp	%o2, MED_MAX	/* limit to store buffer size */
	bleu,pt	%XCC, .Lmedlong
	prefetch [%o1 + (2 * BLOCK_SIZE)], 20

/* The mv_align loop below mimics the memcpy code for large aligned copies,
 * but does not use the ASI_STBI_P (block initializing store) performance
 * optimization. This is used when memcpy is incorrectly invoked with
 * overlapping buffers. */

.Lmv_large_align8_copy:		/* Src and dst share 8 byte align */
	/* align dst to 64 byte boundary */
	andcc	%o0, 0x3f, %o3	/* check for dst 64 byte aligned */
	brz,pn	%o3, .Lmv_aligned_on_64
	sub	%o3, 64, %o3	/* %o3 has negative bytes to move */
	add	%o2, %o3, %o2	/* adjust remaining count */
.Lmv_align_to_64:
	ldx	[%o1], %o4
	add	%o1, 8, %o1	/* increment src ptr */
	addcc	%o3, 8, %o3
	stx	%o4, [%o0]
	brnz,pt	%o3, .Lmv_align_to_64
	add	%o0, 8, %o0	/* increment dst ptr */

.Lmv_aligned_on_64:
	andn	%o2, 0x3f, %o5	/* %o5 is multiple of block size */
	and	%o2, 0x3f, %o2	/* residue bytes in %o2 */
.Lmv_align_loop:
	ldx	[%o1],%o4
	stx	%o4,[%o0]
	prefetch [%o0 + (10 * BLOCK_SIZE)], 22
	prefetch [%o1 + (10 * BLOCK_SIZE)], 21
	subcc	%o5, 64, %o5
	ldx	[%o1+8],%o4
	stx	%o4,[%o0+8]
	ldx	[%o1+16],%o4
	stx	%o4,[%o0+16]
	ldx	[%o1+24],%o4
	stx	%o4,[%o0+24]
	ldx	[%o1+32],%o4
	stx	%o4,[%o0+32]
	ldx	[%o1+40],%o4
	stx	%o4,[%o0+40]
	ldx	[%o1+48],%o4
	add	%o1, 64, %o1
	stx	%o4,[%o0+48]
	add	%o0, 64, %o0
	ldx	[%o1-8],%o4
	bgt,pt	%XCC, .Lmv_align_loop
	stx	%o4,[%o0-8]

	ba	.Lmedlong
	nop
END(__memmove_niagara7)

/* void *mempcpy (void *dst, const void *src, size_t len)
   Same as memcpy but returns dst + len: it seeds %g1 (the value the
   shared memcpy tail returns via EX_RETVAL) with dst + len and jumps
   into __memcpy_niagara7 past that function's own "%g1 = dst" setup.  */
ENTRY(__mempcpy_niagara7)
	/* %o0=dst, %o1=src, %o2=len */
	ba,pt	%icc, 101f
	add	%o0, %o2, %g1	/* save dst + len */
END(__mempcpy_niagara7)

241 .align 32
242ENTRY(__memcpy_niagara7)
243100: /* %o0=dst, %o1=src, %o2=len */
244 mov %o0, %g1 /* save %o0 */
245101:
246#ifndef __arch64__
247 srl %o2, 0, %o2
248#endif
249 cmp %o2, SMALL_MAX /* check for not small case */
250 bgeu,pn %XCC, .Lmedium /* go to larger cases */
251.Lmv_short:
252 cmp %o2, SHORTCOPY /* check for really short case */
253 ble,pn %XCC, .Lsmallfin
254 or %o0, %o1, %o4 /* prepare alignment check */
255 andcc %o4, 0x3, %o5 /* test for word alignment */
256 bnz,pn %XCC, .Lsmallunalign /* branch to non-word aligned case */
257 nop
258 subcc %o2, 7, %o2 /* adjust count */
259 ble,pn %XCC, .Lsmallwordx
260 andcc %o4, 0x7, %o5 /* test for long alignment */
261/* 8 or more bytes, src and dest start on word boundary
262 * %o4 contains or %o0, %o1 */
263.Lsmalllong:
264 bnz,pn %XCC, .Lsmallwords /* branch to word aligned case */
265 cmp %o2, SHORT_LONG-7
266 bge,a %XCC, .Lmedl64 /* if we branch */
267 sub %o2,56,%o2 /* adjust %o2 to -63 off count */
268
269/* slightly unroll the small_long_loop to improve very short copies */
270 cmp %o2, 32-7
271 blt,a,pn %XCC, .Lsmall_long_l
272 sub %o1, %o0, %o1 /* %o1 gets the difference */
273
274 ldx [%o1], %o5
275 ldx [%o1+8], %o4
276 ldx [%o1+16], %o3
277
278 subcc %o2, 24, %o2
279 sub %o1, %o0, %o1 /* %o1 gets the difference */
280
281 stx %o5, [%o0] /* write word */
282 stx %o4, [%o0+8] /* write word */
283 stx %o3, [%o0+16] /* write word */
284
285 add %o0, 24, %o0
286
287/* end loop unroll */
288
289.Lsmall_long_l:
290 ldx [%o1+%o0], %o3
291 subcc %o2, 8, %o2
292 add %o0, 8, %o0
293 bgu,pn %XCC, .Lsmall_long_l /* loop until done */
294 stx %o3, [%o0-8] /* write word */
295 addcc %o2, 7, %o2 /* restore %o2 to correct count */
296 bnz,pn %XCC, .Lsmall_long_x /* check for completion */
297 add %o1, %o0, %o1 /* restore %o1 */
298 retl
299 mov EX_RETVAL(%g1), %o0 /* restore %o0 */
300.Lsmall_long_x:
301 cmp %o2, 4 /* check for 4 or more bytes left */
302 blt,pn %XCC, .Lsmallleft3 /* if not, go to finish up */
303 nop
304 lduw [%o1], %o3
305 add %o1, 4, %o1
306 subcc %o2, 4, %o2
307 stw %o3, [%o0]
308 bnz,pn %XCC, .Lsmallleft3
309 add %o0, 4, %o0
310 retl
311 mov EX_RETVAL(%g1), %o0 /* restore %o0 */
312
313 .align 32
314/* src and dest start on word boundary; 7 or fewer bytes */
315.Lsmallwordx:
316 lduw [%o1], %o3 /* read word */
317 addcc %o2, 3, %o2 /* restore count */
318 bz,pt %XCC, .Lsmallexit
319 stw %o3, [%o0] /* write word */
320 deccc %o2 /* reduce count for cc test */
321 ldub [%o1+4], %o3 /* load one byte */
322 bz,pt %XCC, .Lsmallexit
323 stb %o3, [%o0+4] /* store one byte */
324 ldub [%o1+5], %o3 /* load second byte */
325 deccc %o2
326 bz,pt %XCC, .Lsmallexit
327 stb %o3, [%o0+5] /* store second byte */
328 ldub [%o1+6], %o3 /* load third byte */
329 stb %o3, [%o0+6] /* store third byte */
330.Lsmallexit:
331 retl
332 mov EX_RETVAL(%g1), %o0 /* restore %o0 */
333
334 .align 32
335.Lsmallunalign:
336 cmp %o2, SHORTCHECK
337 ble,pn %XCC, .Lsmallrest
338 cmp %o2, SMALL_UMAX
339 bge,pt %XCC, .Lmedium_join
340 andcc %o1, 0x3, %o5 /* is src word aligned */
341 bz,pn %XCC, .Laldst
342 cmp %o5, 2 /* is src half-word aligned */
343 be,pt %XCC, .Ls2algn
344 cmp %o5, 3 /* src is byte aligned */
345.Ls1algn:
346 ldub [%o1], %o3 /* move 1 or 3 bytes to align it */
347 inc 1, %o1
348 stb %o3, [%o0] /* move a byte to align src */
349 inc 1, %o0
350 bne,pt %XCC, .Ls2algn
351 dec %o2
352 b .Lald /* now go align dest */
353 andcc %o0, 0x3, %o5
354
355.Ls2algn:
356 lduh [%o1], %o3 /* know src is 2 byte aligned */
357 inc 2, %o1
358 srl %o3, 8, %o4
359 stb %o4, [%o0] /* have to do bytes, */
360 stb %o3, [%o0 + 1] /* do not know dst alignment */
361 inc 2, %o0
362 dec 2, %o2
363
364.Laldst:
365 andcc %o0, 0x3, %o5 /* align the destination address */
366.Lald:
367 bz,pn %XCC, .Lw4cp
368 cmp %o5, 2
369 be,pn %XCC, .Lw2cp
370 cmp %o5, 3
371.Lw3cp: lduw [%o1], %o4
372 inc 4, %o1
373 srl %o4, 24, %o5
374 stb %o5, [%o0]
375 bne,pt %XCC, .Lw1cp
376 inc %o0
377 dec 1, %o2
378 andn %o2, 3, %o3 /* %o3 is aligned word count */
379 dec 4, %o3 /* avoid reading beyond tail of src */
380 sub %o1, %o0, %o1 /* %o1 gets the difference */
381
3821: sll %o4, 8, %g5 /* save residual bytes */
383 lduw [%o1+%o0], %o4
384 deccc 4, %o3
385 srl %o4, 24, %o5 /* merge with residual */
386 or %o5, %g5, %g5
387 st %g5, [%o0]
388 bnz,pt %XCC, 1b
389 inc 4, %o0
390 sub %o1, 3, %o1 /* used one byte of last word read */
391 and %o2, 3, %o2
392 b 7f
393 inc 4, %o2
394
395.Lw1cp: srl %o4, 8, %o5
396 sth %o5, [%o0]
397 inc 2, %o0
398 dec 3, %o2
399 andn %o2, 3, %o3 /* %o3 is aligned word count */
400 dec 4, %o3 /* avoid reading beyond tail of src */
401 sub %o1, %o0, %o1 /* %o1 gets the difference */
402
4032: sll %o4, 24, %g5 /* save residual bytes */
404 lduw [%o1+%o0], %o4
405 deccc 4, %o3
406 srl %o4, 8, %o5 /* merge with residual */
407 or %o5, %g5, %g5
408 st %g5, [%o0]
409 bnz,pt %XCC, 2b
410 inc 4, %o0
411 sub %o1, 1, %o1 /* used 3 bytes of last word read */
412 and %o2, 3, %o2
413 b 7f
414 inc 4, %o2
415
416.Lw2cp: lduw [%o1], %o4
417 inc 4, %o1
418 srl %o4, 16, %o5
419 sth %o5, [%o0]
420 inc 2, %o0
421 dec 2, %o2
422 andn %o2, 3, %o3 /* %o3 is aligned word count */
423 dec 4, %o3 /* avoid reading beyond tail of src */
424 sub %o1, %o0, %o1 /* %o1 gets the difference */
425
4263: sll %o4, 16, %g5 /* save residual bytes */
427 lduw [%o1+%o0], %o4
428 deccc 4, %o3
429 srl %o4, 16, %o5 /* merge with residual */
430 or %o5, %g5, %g5
431 st %g5, [%o0]
432 bnz,pt %XCC, 3b
433 inc 4, %o0
434 sub %o1, 2, %o1 /* used two bytes of last word read */
435 and %o2, 3, %o2
436 b 7f
437 inc 4, %o2
438
439.Lw4cp: andn %o2, 3, %o3 /* %o3 is aligned word count */
440 sub %o1, %o0, %o1 /* %o1 gets the difference */
441
4421: lduw [%o1+%o0], %o4 /* read from address */
443 deccc 4, %o3 /* decrement count */
444 st %o4, [%o0] /* write at destination address */
445 bgu,pt %XCC, 1b
446 inc 4, %o0 /* increment to address */
447 and %o2, 3, %o2 /* number of leftover bytes, if any */
448
449 /* simple finish up byte copy, works with any alignment */
4507:
451 add %o1, %o0, %o1 /* restore %o1 */
452.Lsmallrest:
453 tst %o2
454 bz,pt %XCC, .Lsmallx
455 cmp %o2, 4
456 blt,pn %XCC, .Lsmallleft3
457 nop
458 sub %o2, 3, %o2
459.Lsmallnotalign4:
460 ldub [%o1], %o3 /* read byte */
461 subcc %o2, 4, %o2 /* reduce count by 4 */
462 stb %o3, [%o0] /* write byte */
463 ldub [%o1+1], %o3 /* repeat for total of 4 bytes */
464 add %o1, 4, %o1 /* advance SRC by 4 */
465 stb %o3, [%o0+1]
466 ldub [%o1-2], %o3
467 add %o0, 4, %o0 /* advance DST by 4 */
468 stb %o3, [%o0-2]
469 ldub [%o1-1], %o3
470 bgu,pt %XCC, .Lsmallnotalign4 /* loop til 3 or fewer bytes remain */
471 stb %o3, [%o0-1]
472 addcc %o2, 3, %o2 /* restore count */
473 bz,pt %XCC, .Lsmallx
474.Lsmallleft3: /* 1, 2, or 3 bytes remain */
475 subcc %o2, 1, %o2
476 ldub [%o1], %o3 /* load one byte */
477 bz,pt %XCC, .Lsmallx
478 stb %o3, [%o0] /* store one byte */
479 ldub [%o1+1], %o3 /* load second byte */
480 subcc %o2, 1, %o2
481 bz,pt %XCC, .Lsmallx
482 stb %o3, [%o0+1] /* store second byte */
483 ldub [%o1+2], %o3 /* load third byte */
484 stb %o3, [%o0+2] /* store third byte */
485.Lsmallx:
486 retl
487 mov EX_RETVAL(%g1), %o0 /* restore %o0 */
488
489.Lsmallfin:
490 tst %o2
491 bnz,pn %XCC, .Lsmallleft3
492 nop
493 retl
494 mov EX_RETVAL(%g1), %o0 /* restore %o0 */
495
496 .align 16
497.Lsmallwords:
498 lduw [%o1], %o3 /* read word */
499 subcc %o2, 8, %o2 /* update count */
500 stw %o3, [%o0] /* write word */
501 add %o1, 8, %o1 /* update SRC */
502 lduw [%o1-4], %o3 /* read word */
503 add %o0, 8, %o0 /* update DST */
504 bgu,pt %XCC, .Lsmallwords /* loop until done */
505 stw %o3, [%o0-4] /* write word */
506 addcc %o2, 7, %o2 /* restore count */
507 bz,pt %XCC, .Lsmallexit /* check for completion */
508 cmp %o2, 4 /* check for 4 or more bytes left */
509 blt,pt %XCC, .Lsmallleft3 /* if not, go to finish up */
510 nop
511 lduw [%o1], %o3
512 add %o1, 4, %o1
513 subcc %o2, 4, %o2
514 add %o0, 4, %o0
515 bnz,pn %XCC, .Lsmallleft3
516 stw %o3, [%o0-4]
517 retl
518 mov EX_RETVAL(%g1), %o0 /* restore %o0 */
519
520 .align 16
521.Lmedium:
522.Lmedium_join:
523 neg %o0, %o5
524 andcc %o5, 7, %o5 /* bytes till DST 8 byte aligned */
525 brz,pt %o5, .Ldst_aligned_on_8
526
527 /* %o5 has the bytes to be written in partial store. */
528 sub %o2, %o5, %o2
529 sub %o1, %o0, %o1 /* %o1 gets the difference */
5307: /* dst aligning loop */
531 ldub [%o1+%o0], %o4 /* load one byte */
532 subcc %o5, 1, %o5
533 stb %o4, [%o0]
534 bgu,pt %XCC, 7b
535 add %o0, 1, %o0 /* advance dst */
536 add %o1, %o0, %o1 /* restore %o1 */
537.Ldst_aligned_on_8:
538 andcc %o1, 7, %o5
539 brnz,pt %o5, .Lsrc_dst_unaligned_on_8
540 nop
541
542.Lsrc_dst_aligned_on_8:
543 /* check if we are copying MED_MAX or more bytes */
544 cmp %o2, MED_MAX /* limit to store buffer size */
545 bgu,pn %XCC, .Llarge_align8_copy
546 nop
547/*
548 * Special case for handling when src and dest are both long word aligned
549 * and total data to move is less than MED_MAX bytes
550 */
551.Lmedlong:
552 subcc %o2, 63, %o2 /* adjust length to allow cc test */
553 ble,pn %XCC, .Lmedl63 /* skip big loop if < 64 bytes */
554 nop
555.Lmedl64:
556 ldx [%o1], %o4 /* load */
557 subcc %o2, 64, %o2 /* decrement length count */
558 stx %o4, [%o0] /* and store */
559 ldx [%o1+8], %o3 /* a block of 64 bytes */
560 stx %o3, [%o0+8]
561 ldx [%o1+16], %o4
562 stx %o4, [%o0+16]
563 ldx [%o1+24], %o3
564 stx %o3, [%o0+24]
565 ldx [%o1+32], %o4 /* load */
566 stx %o4, [%o0+32] /* and store */
567 ldx [%o1+40], %o3 /* a block of 64 bytes */
568 add %o1, 64, %o1 /* increase src ptr by 64 */
569 stx %o3, [%o0+40]
570 ldx [%o1-16], %o4
571 add %o0, 64, %o0 /* increase dst ptr by 64 */
572 stx %o4, [%o0-16]
573 ldx [%o1-8], %o3
574 bgu,pt %XCC, .Lmedl64 /* repeat if at least 64 bytes left */
575 stx %o3, [%o0-8]
576.Lmedl63:
577 addcc %o2, 32, %o2 /* adjust remaining count */
578 ble,pt %XCC, .Lmedl31 /* to skip if 31 or fewer bytes left */
579 nop
580 ldx [%o1], %o4 /* load */
581 sub %o2, 32, %o2 /* decrement length count */
582 stx %o4, [%o0] /* and store */
583 ldx [%o1+8], %o3 /* a block of 32 bytes */
584 add %o1, 32, %o1 /* increase src ptr by 32 */
585 stx %o3, [%o0+8]
586 ldx [%o1-16], %o4
587 add %o0, 32, %o0 /* increase dst ptr by 32 */
588 stx %o4, [%o0-16]
589 ldx [%o1-8], %o3
590 stx %o3, [%o0-8]
591.Lmedl31:
592 addcc %o2, 16, %o2 /* adjust remaining count */
593 ble,pt %XCC, .Lmedl15 /* skip if 15 or fewer bytes left */
594 nop
595 ldx [%o1], %o4 /* load and store 16 bytes */
596 add %o1, 16, %o1 /* increase src ptr by 16 */
597 stx %o4, [%o0]
598 sub %o2, 16, %o2 /* decrease count by 16 */
599 ldx [%o1-8], %o3
600 add %o0, 16, %o0 /* increase dst ptr by 16 */
601 stx %o3, [%o0-8]
602.Lmedl15:
603 addcc %o2, 15, %o2 /* restore count */
604 bz,pt %XCC, .Lsmallexit /* exit if finished */
605 cmp %o2, 8
606 blt,pt %XCC, .Lmedw7 /* skip if 7 or fewer bytes left */
607 tst %o2
608 ldx [%o1], %o4 /* load 8 bytes */
609 add %o1, 8, %o1 /* increase src ptr by 8 */
610 add %o0, 8, %o0 /* increase dst ptr by 8 */
611 subcc %o2, 8, %o2 /* decrease count by 8 */
612 bnz,pn %XCC, .Lmedw7
613 stx %o4, [%o0-8] /* and store 8 bytes */
614 retl
615 mov EX_RETVAL(%g1), %o0 /* restore %o0 */
616
617 .align 16
618.Lsrc_dst_unaligned_on_8:
619 /* DST is 8-byte aligned, src is not */
620 andcc %o1, 0x3, %o5 /* test word alignment */
621 bnz,pt %XCC, .Lunalignsetup /* branch if not word aligned */
622 nop
623
624/*
625 * Handle all cases where src and dest are aligned on word
626 * boundaries. Use unrolled loops for better performance.
627 * This option wins over standard large data move when
628 * source and destination is in cache for medium
629 * to short data moves.
630 */
631 cmp %o2, MED_WMAX /* limit to store buffer size */
632 bge,pt %XCC, .Lunalignrejoin /* otherwise rejoin main loop */
633 nop
634
635 subcc %o2, 31, %o2 /* adjust length to allow cc test */
636 /* for end of loop */
637 ble,pt %XCC, .Lmedw31 /* skip big loop if less than 16 */
638.Lmedw32:
639 ld [%o1], %o4 /* move a block of 32 bytes */
640 sllx %o4, 32, %o5
641 ld [%o1+4], %o4
642 or %o4, %o5, %o5
643 stx %o5, [%o0]
644 subcc %o2, 32, %o2 /* decrement length count */
645 ld [%o1+8], %o4
646 sllx %o4, 32, %o5
647 ld [%o1+12], %o4
648 or %o4, %o5, %o5
649 stx %o5, [%o0+8]
650 add %o1, 32, %o1 /* increase src ptr by 32 */
651 ld [%o1-16], %o4
652 sllx %o4, 32, %o5
653 ld [%o1-12], %o4
654 or %o4, %o5, %o5
655 stx %o5, [%o0+16]
656 add %o0, 32, %o0 /* increase dst ptr by 32 */
657 ld [%o1-8], %o4
658 sllx %o4, 32, %o5
659 ld [%o1-4], %o4
660 or %o4, %o5, %o5
661 bgu,pt %XCC, .Lmedw32 /* repeat if at least 32 bytes left */
662 stx %o5, [%o0-8]
663.Lmedw31:
664 addcc %o2, 31, %o2 /* restore count */
665 bz,pt %XCC, .Lsmallexit /* exit if finished */
666 cmp %o2, 16
667 blt,pt %XCC, .Lmedw15
668 nop
669 ld [%o1], %o4 /* move a block of 16 bytes */
670 sllx %o4, 32, %o5
671 subcc %o2, 16, %o2 /* decrement length count */
672 ld [%o1+4], %o4
673 or %o4, %o5, %o5
674 stx %o5, [%o0]
675 add %o1, 16, %o1 /* increase src ptr by 16 */
676 ld [%o1-8], %o4
677 add %o0, 16, %o0 /* increase dst ptr by 16 */
678 sllx %o4, 32, %o5
679 ld [%o1-4], %o4
680 or %o4, %o5, %o5
681 stx %o5, [%o0-8]
682.Lmedw15:
683 bz,pt %XCC, .Lsmallexit /* exit if finished */
684 cmp %o2, 8
685 blt,pn %XCC, .Lmedw7 /* skip if 7 or fewer bytes left */
686 tst %o2
687 ld [%o1], %o4 /* load 4 bytes */
688 subcc %o2, 8, %o2 /* decrease count by 8 */
689 stw %o4, [%o0] /* and store 4 bytes */
690 add %o1, 8, %o1 /* increase src ptr by 8 */
691 ld [%o1-4], %o3 /* load 4 bytes */
692 add %o0, 8, %o0 /* increase dst ptr by 8 */
693 stw %o3, [%o0-4] /* and store 4 bytes */
694 bz,pt %XCC, .Lsmallexit /* exit if finished */
695.Lmedw7: /* count is ge 1, less than 8 */
696 cmp %o2, 4 /* check for 4 bytes left */
697 blt,pn %XCC, .Lsmallleft3 /* skip if 3 or fewer bytes left */
698 nop
699 ld [%o1], %o4 /* load 4 bytes */
700 add %o1, 4, %o1 /* increase src ptr by 4 */
701 add %o0, 4, %o0 /* increase dst ptr by 4 */
702 subcc %o2, 4, %o2 /* decrease count by 4 */
703 bnz,pt %XCC, .Lsmallleft3
704 stw %o4, [%o0-4] /* and store 4 bytes */
705 retl
706 mov EX_RETVAL(%g1), %o0 /* restore %o0 */
707
708 .align 16
709.Llarge_align8_copy: /* Src and dst 8 byte aligned */
710 /* align dst to 64 byte boundary */
711 andcc %o0, 0x3f, %o3 /* check for dst 64 byte aligned */
712 brz,pn %o3, .Laligned_to_64
713 andcc %o0, 8, %o3 /* odd long words to move? */
714 brz,pt %o3, .Laligned_to_16
715 nop
716 ldx [%o1], %o4
717 sub %o2, 8, %o2
718 add %o1, 8, %o1 /* increment src ptr */
719 add %o0, 8, %o0 /* increment dst ptr */
720 stx %o4, [%o0-8]
721.Laligned_to_16:
722 andcc %o0, 16, %o3 /* pair of long words to move? */
723 brz,pt %o3, .Laligned_to_32
724 nop
725 ldx [%o1], %o4
726 sub %o2, 16, %o2
727 stx %o4, [%o0]
728 add %o1, 16, %o1 /* increment src ptr */
729 ldx [%o1-8], %o4
730 add %o0, 16, %o0 /* increment dst ptr */
731 stx %o4, [%o0-8]
732.Laligned_to_32:
733 andcc %o0, 32, %o3 /* four long words to move? */
734 brz,pt %o3, .Laligned_to_64
735 nop
736 ldx [%o1], %o4
737 sub %o2, 32, %o2
738 stx %o4, [%o0]
739 ldx [%o1+8], %o4
740 stx %o4, [%o0+8]
741 ldx [%o1+16], %o4
742 stx %o4, [%o0+16]
743 add %o1, 32, %o1 /* increment src ptr */
744 ldx [%o1-8], %o4
745 add %o0, 32, %o0 /* increment dst ptr */
746 stx %o4, [%o0-8]
747.Laligned_to_64:
748/* Following test is included to avoid issues where existing executables
749 * incorrectly call memcpy with overlapping src and dest instead of memmove
750 *
751 * if ( (src ge dst) and (dst+len > src)) go to overlap case
752 * if ( (src lt dst) and (src+len > dst)) go to overlap case
753 */
754 cmp %o1,%o0
755 bge,pt %XCC, 1f
756 nop
757/* src+len > dst? */
758 add %o1, %o2, %o4
759 cmp %o4, %o0
760 bgt,pt %XCC, .Lmv_aligned_on_64
761 nop
762 ba 2f
763 nop
7641:
765/* dst+len > src? */
766 add %o0, %o2, %o4
767 cmp %o4, %o1
768 bgt,pt %XCC, .Lmv_aligned_on_64
769 nop
7702:
771/* handle non-overlapped copies
772 *
773 * Using block init store (BIS) instructions to avoid fetching cache
774 * lines from memory. Use ST_CHUNK stores to first element of each cache
775 * line (similar to prefetching) to avoid overfilling STQ or miss buffers.
776 * Gives existing cache lines time to be moved out of L1/L2/L3 cache.
777 */
778 andn %o2, 0x3f, %o5 /* %o5 is multiple of block size */
779 and %o2, 0x3f, %o2 /* residue bytes in %o2 */
780
781/* We use ASI_STBIMRU_P for the first store to each cache line
782 * followed by ASI_STBI_P (mark as LRU) for the last store. That
783 * mixed approach reduces the chances the cache line is removed
784 * before we finish setting it, while minimizing the effects on
785 * other cached values during a large memcpy
786 *
787 * Intermediate stores can be normal since first BIS activates the
788 * cache line in the L2 cache.
789 *
790 * ST_CHUNK batches up initial BIS operations for several cache lines
791 * to allow multiple requests to not be blocked by overflowing the
792 * the store miss buffer. Then the matching stores for all those
793 * BIS operations are executed.
794 */
795
796.Lalign_loop:
797 cmp %o5, ST_CHUNK*64
798 blu,pt %XCC, .Lalign_short
799 mov ST_CHUNK, %o3
800 sllx %o3, 6, %g5 /* ST_CHUNK*64 */
801
802.Lalign_loop_start:
803 prefetch [%o1 + (ALIGN_PRE * BLOCK_SIZE)], 21
804 subcc %o3, 2, %o3
805 ldx [%o1], %o4
806 add %o1, 128, %o1
807 EX_ST(STORE_ASI(%o4, %o0))
808 add %o0, 64, %o0
809 ldx [%o1-64], %o4
810 EX_ST(STORE_ASI(%o4, %o0))
811 add %o0, 64, %o0
812 bgu,pt %XCC, .Lalign_loop_start
813 prefetch [%o1 + ((ALIGN_PRE-1) * BLOCK_SIZE)], 21
814
815 mov ST_CHUNK, %o3
816 sub %o1, %g5, %o1 /* reset %o1 */
817 sub %o0, %g5, %o0 /* reset %o0 */
818
819 sub %o0, 8, %o0 /* adjust %o0 for ASI alignment */
820.Lalign_loop_rest:
821 ldx [%o1+8],%o4
822 add %o0, 64, %o0
823 stx %o4, [%o0-48]
824 subcc %o3, 1, %o3
825 ldx [%o1+16],%o4
826 stx %o4, [%o0-40]
827 sub %o5, 64, %o5
828 ldx [%o1+24],%o4
829 stx %o4, [%o0-32]
830 ldx [%o1+32],%o4
831 stx %o4, [%o0-24]
832 ldx [%o1+40],%o4
833 stx %o4, [%o0-16]
834 ldx [%o1+48],%o4
835 stx %o4, [%o0-8]
836 add %o1, 64, %o1
837 ldx [%o1-8],%o4
838 bgu,pt %XCC, .Lalign_loop_rest
839 EX_ST(STORE_INIT(%o4,%o0)) /* mark cache line as LRU */
840
841 mov ST_CHUNK, %o3
842 cmp %o5, ST_CHUNK*64
843 bgu,pt %XCC, .Lalign_loop_start
844 add %o0, 8, %o0 /* restore %o0 from ASI alignment */
845
846 cmp %o5, 0
847 beq,pt %XCC, .Lalign_done
848
849/* no prefetches needed in these loops
850 * since we are within ALIGN_PRE of the end */
851.Lalign_short:
852 srl %o5, 6, %o3
853.Lalign_loop_short:
854 subcc %o3, 1, %o3
855 ldx [%o1], %o4
856 add %o1, 64, %o1
857 EX_ST(STORE_ASI(%o4, %o0))
858 bgu,pt %XCC, .Lalign_loop_short
859 add %o0, 64, %o0
860
861 sub %o1, %o5, %o1 /* reset %o1 */
862 sub %o0, %o5, %o0 /* reset %o0 */
863
864 sub %o0, 8, %o0 /* adjust %o0 for ASI alignment */
865.Lalign_short_rest:
866 ldx [%o1+8],%o4
867 add %o0, 64, %o0
868 stx %o4, [%o0-48]
869 ldx [%o1+16],%o4
870 subcc %o5, 64, %o5
871 stx %o4, [%o0-40]
872 ldx [%o1+24],%o4
873 stx %o4, [%o0-32]
874 ldx [%o1+32],%o4
875 stx %o4, [%o0-24]
876 ldx [%o1+40],%o4
877 stx %o4, [%o0-16]
878 ldx [%o1+48],%o4
879 stx %o4, [%o0-8]
880 add %o1, 64, %o1
881 ldx [%o1-8],%o4
882 bgu,pt %XCC, .Lalign_short_rest
883 EX_ST(STORE_INIT(%o4,%o0)) /* mark cache line as LRU */
884
885 add %o0, 8, %o0 /* restore %o0 from ASI alignment */
886
887.Lalign_done:
888 cmp %o2, 0
889 membar #StoreStore
890 bne,pt %XCC, .Lmedl63
891 subcc %o2, 63, %o2 /* adjust length to allow cc test */
892 retl
893 mov EX_RETVAL(%g1), %o0 /* restore %o0 */
894
895 .align 16
896 /* Dst is on 8 byte boundary; src is not; remaining cnt > SMALL_MAX */
897 /* Since block load/store and BIS are not in use for unaligned data,
898 * no need to align dst on 64 byte cache line boundary */
899.Lunalignsetup:
900.Lunalignrejoin:
901 rd %fprs, %g5 /* check for unused fp */
902 /* if fprs.fef == 0, set it.
903 * Setting it when already set costs more than checking */
904 andcc %g5, FPRS_FEF, %g5 /* test FEF, fprs.du = fprs.dl = 0 */
905 bz,a %XCC, 1f
906 wr %g0, FPRS_FEF, %fprs /* fprs.fef = 1 */
9071:
908 andn %o2, 0x3f, %o5 /* %o5 is multiple of block size */
909 and %o2, 0x3f, %o2 /* residue bytes in %o2 */
910 cmp %o2, 8 /* Insure we do not load beyond */
911 bgt,pt %XCC, .Lunalign_adjust /* end of source buffer */
912 andn %o1, 0x7, %o4 /* %o4 has 8 byte aligned src addr */
913 add %o2, 64, %o2 /* adjust to leave loop */
914 sub %o5, 64, %o5 /* early if necessary */
915.Lunalign_adjust:
916 alignaddr %o1, %g0, %g0 /* generate %gsr */
917 add %o1, %o5, %o1 /* advance %o1 to after blocks */
918 ldd [%o4], %f0
919.Lunalign_loop:
920 prefetch [%o0 + (9 * BLOCK_SIZE)], 20
921 ldd [%o4+8], %f2
922 faligndata %f0, %f2, %f16
923 ldd [%o4+16], %f4
924 subcc %o5, BLOCK_SIZE, %o5
925 std %f16, [%o0]
926 faligndata %f2, %f4, %f18
927 ldd [%o4+24], %f6
928 std %f18, [%o0+8]
929 faligndata %f4, %f6, %f20
930 ldd [%o4+32], %f8
931 std %f20, [%o0+16]
932 faligndata %f6, %f8, %f22
933 ldd [%o4+40], %f10
934 std %f22, [%o0+24]
935 faligndata %f8, %f10, %f24
936 ldd [%o4+48], %f12
937 std %f24, [%o0+32]
938 faligndata %f10, %f12, %f26
939 ldd [%o4+56], %f14
940 add %o4, BLOCK_SIZE, %o4
941 std %f26, [%o0+40]
942 faligndata %f12, %f14, %f28
943 ldd [%o4], %f0
944 std %f28, [%o0+48]
945 faligndata %f14, %f0, %f30
946 std %f30, [%o0+56]
947 add %o0, BLOCK_SIZE, %o0
948 bgu,pt %XCC, .Lunalign_loop
949 prefetch [%o4 + (11 * BLOCK_SIZE)], 20
950
951 /* Handle trailing bytes, 64 to 127
952 * Dest long word aligned, Src not long word aligned */
953 cmp %o2, 15
954 bleu,pt %XCC, .Lunalign_short
955
956 andn %o2, 0x7, %o5 /* %o5 is multiple of 8 */
957 and %o2, 0x7, %o2 /* residue bytes in %o2 */
958 add %o2, 8, %o2
959 sub %o5, 8, %o5 /* do not load past end of src */
960 andn %o1, 0x7, %o4 /* %o4 has 8 byte aligned src addr */
961 add %o1, %o5, %o1 /* move %o1 to after multiple of 8 */
962 ldd [%o4], %f0 /* fetch partial word */
963.Lunalign_by8:
964 ldd [%o4+8], %f2
965 add %o4, 8, %o4
966 faligndata %f0, %f2, %f16
967 subcc %o5, 8, %o5
968 std %f16, [%o0]
969 fsrc2 %f2, %f0
970 bgu,pt %XCC, .Lunalign_by8
971 add %o0, 8, %o0
972
973.Lunalign_short: /* restore fprs state */
974 brnz,pt %g5, .Lsmallrest
975 nop
976 ba .Lsmallrest
977 wr %g5, %g0, %fprs
978END(__memcpy_niagara7)
979
980#endif