/* Copy SIZE bytes from SRC to DEST.  For SUN4V M7.
   Copyright (C) 2017-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

#ifndef XCC
# define XCC xcc
#endif
	.register	%g2,#scratch
	.register	%g3,#scratch
	.register	%g6,#scratch

#define FPRS_FEF	0x04

/*
 * ASI_STBI_P marks the cache line as "least recently used"
 * which means if many threads are active, it has a high chance
 * of being pushed out of the cache between the first initializing
 * store and the final stores.
 * Thus, in this algorithm we use ASI_STBIMRU_P which marks the
 * cache line as "most recently used" for all but the last cache
 * line.
 */
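
/*
 * Illustrative sketch (not part of the original code) of the per-cache-line
 * store pattern the BIS loops below use; register names are placeholders,
 * see .Lalign_loop_rest for the exact sequence:
 *
 *	stxa	%reg, [%dst] ASI_STBIMRU_P	first store claims the line, MRU
 *	stx	%reg, [%dst+8]			offsets 8..48: normal stores
 *	...
 *	stxa	%reg, [%dst+56] ASI_STBI_P	last store marks the line LRU
 */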

#define ASI_BLK_INIT_QUAD_LDD_P	0xe2
#define ASI_ST_BLK_INIT_MRU_P	0xf2

#define ASI_STBI_P	ASI_BLK_INIT_QUAD_LDD_P
#define ASI_STBIMRU_P	ASI_ST_BLK_INIT_MRU_P

#define BLOCK_SIZE	64	/* L2 data cache line size */
#define SHORTCOPY	3
#define SHORTCHECK	14
#define SHORT_LONG	64	/* max copy for short longword-aligned case */
				/* must be at least 64 */
#define SMALL_MAX	255	/* max small copy for word/long aligned */
#define SMALL_UMAX	128	/* max small copy for unaligned case */
#define MED_WMAX	1023	/* max copy for medium word-aligned case */
#define MED_MAX		511	/* max copy for medium longword-aligned case */
#define ST_CHUNK	20	/* ST_CHUNK - block of values for BIS Store */
/* on T4, prefetch 20 is a strong read prefetch to L1 and L2 data cache
 * prefetch 20 can cause inst pipeline to delay if data is in memory
 * prefetch 21 is a strong read prefetch to L2 data cache, not L1 data cache */
#define ALIGN_PRE	20	/* distance for aligned prefetch loop */
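
/*
 * Rough model of how these thresholds steer the dispatch below (a sketch
 * only; the real branches live in the entry code, .Lmv_short and .Lmedium):
 *
 *	if (len <= SHORTCOPY)        byte-by-byte tail copy
 *	else if (len < SMALL_MAX)    small word/longword loops, no prefetch
 *	else if (len <= MED_MAX)     medium 64-byte unrolled load/store loop
 *	else                         large copy using BIS (block-init) stores
 *
 * Copies that are only word-aligned use the .Lmedw32 loop up to MED_WMAX
 * bytes, and unaligned copies join the medium path once they reach
 * SMALL_UMAX bytes.
 */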

#define EX_ST(x)	x
#define EX_RETVAL(x)	x
#define STORE_ASI(src,addr)	stxa src, [addr] ASI_STBIMRU_P
#define STORE_INIT(src,addr)	stxa src, [addr] ASI_STBI_P

#if IS_IN (libc)

	.text

ENTRY(__memmove_niagara7)
	/* %o0=dst, %o1=src, %o2=len */
	cmp	%o1, %o0	/* if from address is >= to use forward copy */
	bgeu,pn	%XCC, .Lforcpy	/* else use backward if ... */
	sub	%o0, %o1, %o4	/* get difference of two addresses */
	cmp	%o2, %o4	/* compare size and difference of addresses */
	bleu,pn	%XCC, .Lforcpy	/* if size is bigger, do overlapped copy */
	add	%o1, %o2, %o5	/* get to end of source space */

/* an overlapped copy that must be done "backwards" */
.Lchksize:
	cmp	%o2, 8			/* less than 8 bytes, do byte copy */
	blu,pn	%XCC, 2f		/* else continue */

/* Now size is bigger than 8 */
.Ldbalign:
	add	%o0, %o2, %g1		/* get to end of dest space */
	andcc	%g1, 7, %o3		/* %o3 has cnt til dst 8 byte align */
	bz,a,pn	%XCC, .Ldbbck		/* skip if dst is 8 byte aligned */
	andn	%o2, 7, %o3		/* force %o3 cnt to multiple of 8 */
	sub	%o2, %o3, %o2		/* update o2 with new count */

1:	dec	%o5			/* decrement source */
	ldub	[%o5], %g1		/* load one byte */
	deccc	%o3			/* decrement count */
	bgu,pt	%XCC, 1b		/* if not done keep copying */
	stb	%g1, [%o5+%o4]		/* store one byte into dest */
	andncc	%o2, 7, %o3		/* force %o3 cnt to multiple of 8 */
	bz,pn	%XCC, 2f		/* if size < 8, move to byte copy */

/* Now Destination is 8 byte aligned */
.Ldbbck:
	andcc	%o5, 7, %o0		/* %o0 has src offset */
	bz,a,pn	%XCC, .Ldbcopybc	/* if src is aligned do fast memmove */
	sub	%o2, %o3, %o2		/* Residue bytes in %o2 */

.Lcpy_dbwdbc:				/* alignment of src is needed */
	sub	%o2, 8, %o2		/* set size one loop ahead */
	sll	%o0, 3, %g1		/* %g1 is left shift */
	mov	64, %g5			/* init %g5 to be 64 */
	sub	%g5, %g1, %g5		/* %g5 rightshift = (64 - leftshift) */
	sub	%o5, %o0, %o5		/* align the src at 8 bytes. */
	add	%o4, %o0, %o4		/* increase diff between src & dst */
	ldx	[%o5], %o1		/* load first 8 bytes */
	srlx	%o1, %g5, %o1
1:	sub	%o5, 8, %o5		/* subtract 8 from src */
	ldx	[%o5], %o0		/* load 8 byte */
	sllx	%o0, %g1, %o3		/* shift loaded val left to tmp reg */
	or	%o1, %o3, %o3		/* align data */
	stx	%o3, [%o5+%o4]		/* store 8 byte */
	subcc	%o2, 8, %o2		/* subtract 8 byte from size */
	bg,pt	%XCC, 1b		/* if size > 0 continue */
	srlx	%o0, %g5, %o1		/* move extra byte for the next use */

	srl	%g1, 3, %o0		/* restore %o0 value for alignment */
	add	%o5, %o0, %o5		/* restore src alignment */
	sub	%o4, %o0, %o4		/* restore diff between src & dest */

	ba	2f			/* branch to the trailing byte copy */
	add	%o2, 8, %o2		/* restore size value */

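/*
 * The .Lcpy_dbwdbc loop above is the usual shift-and-merge trick for a
 * misaligned source, copying backwards.  A rough C equivalent (illustrative
 * only; ls/rs are the shift counts in %g1/%g5 computed above):
 *
 *	carry = *src8 >> rs;
 *	do {
 *	    src8--;
 *	    cur   = *src8;			aligned 8-byte load
 *	    *dst8 = carry | (cur << ls);	splice two aligned words
 *	    carry = cur >> rs;			leftover bytes for next round
 *	} while (remaining > 0);
 */
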
.Ldbcopybc:				/* alignment of src is not needed */
1:	sub	%o5, 8, %o5		/* subtract from src */
	ldx	[%o5], %g1		/* load 8 bytes */
	subcc	%o3, 8, %o3		/* subtract from size */
	bgu,pt	%XCC, 1b		/* if size is bigger than 0 continue */
	stx	%g1, [%o5+%o4]		/* store 8 bytes to destination */

	ba	2f
	nop

.Lbcbyte:
1:	ldub	[%o5], %g1		/* load one byte */
	stb	%g1, [%o5+%o4]		/* store one byte */
2:	deccc	%o2			/* decrement size */
	bgeu,a,pt %XCC, 1b		/* if size is >= 0 continue */
	dec	%o5			/* decrement from address */

.Lexitbc:				/* exit from backward copy */
	retl
	add	%o5, %o4, %o0		/* restore dest addr */


/* Check to see if memmove is large aligned copy
 * If so, use special version of copy that avoids
 * use of block store init. */
.Lforcpy:
	cmp	%o2, SMALL_MAX		/* check for not small case */
	blt,pn	%XCC, .Lmv_short	/* merge with memcpy */
	mov	%o0, %g1		/* save %o0 */
	neg	%o0, %o5
	andcc	%o5, 7, %o5		/* bytes till DST 8 byte aligned */
	brz,pt	%o5, .Lmv_dst_aligned_on_8

	/* %o5 has the bytes to be written in partial store. */
	sub	%o2, %o5, %o2
	sub	%o1, %o0, %o1		/* %o1 gets the difference */
7:					/* dst aligning loop */
	ldub	[%o1+%o0], %o4		/* load one byte */
	subcc	%o5, 1, %o5
	stb	%o4, [%o0]
	bgu,pt	%XCC, 7b
	add	%o0, 1, %o0		/* advance dst */
	add	%o1, %o0, %o1		/* restore %o1 */
.Lmv_dst_aligned_on_8:
	andcc	%o1, 7, %o5
	brnz,pn	%o5, .Lsrc_dst_unaligned_on_8
	prefetch [%o1 + (1 * BLOCK_SIZE)], 20

.Lmv_src_dst_aligned_on_8:
	/* check if we are copying MED_MAX or more bytes */
	cmp	%o2, MED_MAX		/* limit to store buffer size */
	bleu,pt	%XCC, .Lmedlong
	prefetch [%o1 + (2 * BLOCK_SIZE)], 20

/* The mv_align loop below mimics the memcpy code for large aligned copies,
 * but does not use the ASI_STBI_P (block initializing store) performance
 * optimization. This is used when memcpy is incorrectly invoked with
 * overlapping buffers. */
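/* A block-initializing store can claim the destination cache line without
 * first fetching its old contents, so with overlapping buffers it could
 * clobber source bytes that have not been copied yet; the plain stores used
 * here avoid that hazard at some cost in memory bandwidth. */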

.Lmv_large_align8_copy:			/* Src and dst share 8 byte align */
	/* align dst to 64 byte boundary */
	andcc	%o0, 0x3f, %o3		/* check for dst 64 byte aligned */
	brz,pn	%o3, .Lmv_aligned_on_64
	sub	%o3, 64, %o3		/* %o3 has negative bytes to move */
	add	%o2, %o3, %o2		/* adjust remaining count */
.Lmv_align_to_64:
	ldx	[%o1], %o4
	add	%o1, 8, %o1		/* increment src ptr */
	addcc	%o3, 8, %o3
	stx	%o4, [%o0]
	brnz,pt	%o3, .Lmv_align_to_64
	add	%o0, 8, %o0		/* increment dst ptr */

.Lmv_aligned_on_64:
	andn	%o2, 0x3f, %o5		/* %o5 is multiple of block size */
	and	%o2, 0x3f, %o2		/* residue bytes in %o2 */
.Lmv_align_loop:
	ldx	[%o1], %o4
	stx	%o4, [%o0]
	prefetch [%o0 + (10 * BLOCK_SIZE)], 22
	prefetch [%o1 + (10 * BLOCK_SIZE)], 21
	subcc	%o5, 64, %o5
	ldx	[%o1+8], %o4
	stx	%o4, [%o0+8]
	ldx	[%o1+16], %o4
	stx	%o4, [%o0+16]
	ldx	[%o1+24], %o4
	stx	%o4, [%o0+24]
	ldx	[%o1+32], %o4
	stx	%o4, [%o0+32]
	ldx	[%o1+40], %o4
	stx	%o4, [%o0+40]
	ldx	[%o1+48], %o4
	add	%o1, 64, %o1
	stx	%o4, [%o0+48]
	add	%o0, 64, %o0
	ldx	[%o1-8], %o4
	bgt,pt	%XCC, .Lmv_align_loop
	stx	%o4, [%o0-8]

	ba	.Lmedlong
	nop
END(__memmove_niagara7)

ENTRY(__mempcpy_niagara7)
	/* %o0=dst, %o1=src, %o2=len */
	ba,pt	%icc, 101f
	add	%o0, %o2, %g1		/* save dst + len */
END(__mempcpy_niagara7)
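
/*
 * __mempcpy only differs from memcpy in its return value: it returns
 * dst + len instead of dst, which is why %g1 (the value restored into %o0
 * on return) is preloaded with %o0 + %o2 above.  Roughly, in C:
 *
 *	void *mempcpy (void *dst, const void *src, size_t n)
 *	{
 *	  memcpy (dst, src, n);
 *	  return (char *) dst + n;
 *	}
 */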

	.align	32
ENTRY(__memcpy_niagara7)
100:	/* %o0=dst, %o1=src, %o2=len */
	mov	%o0, %g1		/* save %o0 */
101:
#ifndef __arch64__
	srl	%o2, 0, %o2
#endif
	cmp	%o2, SMALL_MAX		/* check for not small case */
	bgeu,pn	%XCC, .Lmedium		/* go to larger cases */
.Lmv_short:
	cmp	%o2, SHORTCOPY		/* check for really short case */
	ble,pn	%XCC, .Lsmallfin
	or	%o0, %o1, %o4		/* prepare alignment check */
	andcc	%o4, 0x3, %o5		/* test for word alignment */
	bnz,pn	%XCC, .Lsmallunalign	/* branch to non-word aligned case */
	nop
	subcc	%o2, 7, %o2		/* adjust count */
	ble,pn	%XCC, .Lsmallwordx
	andcc	%o4, 0x7, %o5		/* test for long alignment */
/* 8 or more bytes, src and dest start on word boundary
 * %o4 contains or %o0, %o1 */
.Lsmalllong:
	bnz,pn	%XCC, .Lsmallwords	/* branch to word aligned case */
	cmp	%o2, SHORT_LONG-7
	bge,a	%XCC, .Lmedl64		/* if we branch */
	sub	%o2, 56, %o2		/* adjust %o2 to -63 off count */

/* slightly unroll the small_long_loop to improve very short copies */
	cmp	%o2, 32-7
	blt,a,pn %XCC, .Lsmall_long_l
	sub	%o1, %o0, %o1		/* %o1 gets the difference */

	ldx	[%o1], %o5
	ldx	[%o1+8], %o4
	ldx	[%o1+16], %o3

	subcc	%o2, 24, %o2
	sub	%o1, %o0, %o1		/* %o1 gets the difference */

	stx	%o5, [%o0]		/* write word */
	stx	%o4, [%o0+8]		/* write word */
	stx	%o3, [%o0+16]		/* write word */

	add	%o0, 24, %o0

/* end loop unroll */

.Lsmall_long_l:
	ldx	[%o1+%o0], %o3
	subcc	%o2, 8, %o2
	add	%o0, 8, %o0
	bgu,pn	%XCC, .Lsmall_long_l	/* loop until done */
	stx	%o3, [%o0-8]		/* write word */
	addcc	%o2, 7, %o2		/* restore %o2 to correct count */
	bnz,pn	%XCC, .Lsmall_long_x	/* check for completion */
	add	%o1, %o0, %o1		/* restore %o1 */
	retl
	mov	EX_RETVAL(%g1), %o0	/* restore %o0 */
.Lsmall_long_x:
	cmp	%o2, 4			/* check for 4 or more bytes left */
	blt,pn	%XCC, .Lsmallleft3	/* if not, go to finish up */
	nop
	lduw	[%o1], %o3
	add	%o1, 4, %o1
	subcc	%o2, 4, %o2
	stw	%o3, [%o0]
	bnz,pn	%XCC, .Lsmallleft3
	add	%o0, 4, %o0
	retl
	mov	EX_RETVAL(%g1), %o0	/* restore %o0 */

	.align	32
/* src and dest start on word boundary; 7 or fewer bytes */
.Lsmallwordx:
	lduw	[%o1], %o3		/* read word */
	addcc	%o2, 3, %o2		/* restore count */
	bz,pt	%XCC, .Lsmallexit
	stw	%o3, [%o0]		/* write word */
	deccc	%o2			/* reduce count for cc test */
	ldub	[%o1+4], %o3		/* load one byte */
	bz,pt	%XCC, .Lsmallexit
	stb	%o3, [%o0+4]		/* store one byte */
	ldub	[%o1+5], %o3		/* load second byte */
	deccc	%o2
	bz,pt	%XCC, .Lsmallexit
	stb	%o3, [%o0+5]		/* store second byte */
	ldub	[%o1+6], %o3		/* load third byte */
	stb	%o3, [%o0+6]		/* store third byte */
.Lsmallexit:
	retl
	mov	EX_RETVAL(%g1), %o0	/* restore %o0 */

	.align	32
.Lsmallunalign:
	cmp	%o2, SHORTCHECK
	ble,pn	%XCC, .Lsmallrest
	cmp	%o2, SMALL_UMAX
	bge,pt	%XCC, .Lmedium_join
	andcc	%o1, 0x3, %o5		/* is src word aligned */
	bz,pn	%XCC, .Laldst
	cmp	%o5, 2			/* is src half-word aligned */
	be,pt	%XCC, .Ls2algn
	cmp	%o5, 3			/* src is byte aligned */
.Ls1algn:
	ldub	[%o1], %o3		/* move 1 or 3 bytes to align it */
	inc	1, %o1
	stb	%o3, [%o0]		/* move a byte to align src */
	inc	1, %o0
	bne,pt	%XCC, .Ls2algn
	dec	%o2
	b	.Lald			/* now go align dest */
	andcc	%o0, 0x3, %o5

.Ls2algn:
	lduh	[%o1], %o3		/* know src is 2 byte aligned */
	inc	2, %o1
	srl	%o3, 8, %o4
	stb	%o4, [%o0]		/* have to do bytes, */
	stb	%o3, [%o0 + 1]		/* do not know dst alignment */
	inc	2, %o0
	dec	2, %o2

.Laldst:
	andcc	%o0, 0x3, %o5		/* align the destination address */
.Lald:
	bz,pn	%XCC, .Lw4cp
	cmp	%o5, 2
	be,pn	%XCC, .Lw2cp
	cmp	%o5, 3
.Lw3cp:	lduw	[%o1], %o4
	inc	4, %o1
	srl	%o4, 24, %o5
	stb	%o5, [%o0]
	bne,pt	%XCC, .Lw1cp
	inc	%o0
	dec	1, %o2
	andn	%o2, 3, %o3		/* %o3 is aligned word count */
	dec	4, %o3			/* avoid reading beyond tail of src */
	sub	%o1, %o0, %o1		/* %o1 gets the difference */

1:	sll	%o4, 8, %g5		/* save residual bytes */
	lduw	[%o1+%o0], %o4
	deccc	4, %o3
	srl	%o4, 24, %o5		/* merge with residual */
	or	%o5, %g5, %g5
	st	%g5, [%o0]
	bnz,pt	%XCC, 1b
	inc	4, %o0
	sub	%o1, 3, %o1		/* used one byte of last word read */
	and	%o2, 3, %o2
	b	7f
	inc	4, %o2

.Lw1cp:	srl	%o4, 8, %o5
	sth	%o5, [%o0]
	inc	2, %o0
	dec	3, %o2
	andn	%o2, 3, %o3		/* %o3 is aligned word count */
	dec	4, %o3			/* avoid reading beyond tail of src */
	sub	%o1, %o0, %o1		/* %o1 gets the difference */

2:	sll	%o4, 24, %g5		/* save residual bytes */
	lduw	[%o1+%o0], %o4
	deccc	4, %o3
	srl	%o4, 8, %o5		/* merge with residual */
	or	%o5, %g5, %g5
	st	%g5, [%o0]
	bnz,pt	%XCC, 2b
	inc	4, %o0
	sub	%o1, 1, %o1		/* used 3 bytes of last word read */
	and	%o2, 3, %o2
	b	7f
	inc	4, %o2

.Lw2cp:	lduw	[%o1], %o4
	inc	4, %o1
	srl	%o4, 16, %o5
	sth	%o5, [%o0]
	inc	2, %o0
	dec	2, %o2
	andn	%o2, 3, %o3		/* %o3 is aligned word count */
	dec	4, %o3			/* avoid reading beyond tail of src */
	sub	%o1, %o0, %o1		/* %o1 gets the difference */

3:	sll	%o4, 16, %g5		/* save residual bytes */
	lduw	[%o1+%o0], %o4
	deccc	4, %o3
	srl	%o4, 16, %o5		/* merge with residual */
	or	%o5, %g5, %g5
	st	%g5, [%o0]
	bnz,pt	%XCC, 3b
	inc	4, %o0
	sub	%o1, 2, %o1		/* used two bytes of last word read */
	and	%o2, 3, %o2
	b	7f
	inc	4, %o2

.Lw4cp:	andn	%o2, 3, %o3		/* %o3 is aligned word count */
	sub	%o1, %o0, %o1		/* %o1 gets the difference */

1:	lduw	[%o1+%o0], %o4		/* read from address */
	deccc	4, %o3			/* decrement count */
	st	%o4, [%o0]		/* write at destination address */
	bgu,pt	%XCC, 1b
	inc	4, %o0			/* increment to address */
	and	%o2, 3, %o2		/* number of leftover bytes, if any */

/* simple finish up byte copy, works with any alignment */
7:
	add	%o1, %o0, %o1		/* restore %o1 */
.Lsmallrest:
	tst	%o2
	bz,pt	%XCC, .Lsmallx
	cmp	%o2, 4
	blt,pn	%XCC, .Lsmallleft3
	nop
	sub	%o2, 3, %o2
.Lsmallnotalign4:
	ldub	[%o1], %o3		/* read byte */
	subcc	%o2, 4, %o2		/* reduce count by 4 */
	stb	%o3, [%o0]		/* write byte */
	ldub	[%o1+1], %o3		/* repeat for total of 4 bytes */
	add	%o1, 4, %o1		/* advance SRC by 4 */
	stb	%o3, [%o0+1]
	ldub	[%o1-2], %o3
	add	%o0, 4, %o0		/* advance DST by 4 */
	stb	%o3, [%o0-2]
	ldub	[%o1-1], %o3
	bgu,pt	%XCC, .Lsmallnotalign4	/* loop til 3 or fewer bytes remain */
	stb	%o3, [%o0-1]
	addcc	%o2, 3, %o2		/* restore count */
	bz,pt	%XCC, .Lsmallx
.Lsmallleft3:				/* 1, 2, or 3 bytes remain */
	subcc	%o2, 1, %o2
	ldub	[%o1], %o3		/* load one byte */
	bz,pt	%XCC, .Lsmallx
	stb	%o3, [%o0]		/* store one byte */
	ldub	[%o1+1], %o3		/* load second byte */
	subcc	%o2, 1, %o2
	bz,pt	%XCC, .Lsmallx
	stb	%o3, [%o0+1]		/* store second byte */
	ldub	[%o1+2], %o3		/* load third byte */
	stb	%o3, [%o0+2]		/* store third byte */
.Lsmallx:
	retl
	mov	EX_RETVAL(%g1), %o0	/* restore %o0 */

.Lsmallfin:
	tst	%o2
	bnz,pn	%XCC, .Lsmallleft3
	nop
	retl
	mov	EX_RETVAL(%g1), %o0	/* restore %o0 */

	.align	16
.Lsmallwords:
	lduw	[%o1], %o3		/* read word */
	subcc	%o2, 8, %o2		/* update count */
	stw	%o3, [%o0]		/* write word */
	add	%o1, 8, %o1		/* update SRC */
	lduw	[%o1-4], %o3		/* read word */
	add	%o0, 8, %o0		/* update DST */
	bgu,pt	%XCC, .Lsmallwords	/* loop until done */
	stw	%o3, [%o0-4]		/* write word */
	addcc	%o2, 7, %o2		/* restore count */
	bz,pt	%XCC, .Lsmallexit	/* check for completion */
	cmp	%o2, 4			/* check for 4 or more bytes left */
	blt,pt	%XCC, .Lsmallleft3	/* if not, go to finish up */
	nop
	lduw	[%o1], %o3
	add	%o1, 4, %o1
	subcc	%o2, 4, %o2
	add	%o0, 4, %o0
	bnz,pn	%XCC, .Lsmallleft3
	stw	%o3, [%o0-4]
	retl
	mov	EX_RETVAL(%g1), %o0	/* restore %o0 */

	.align	16
.Lmedium:
.Lmedium_join:
	neg	%o0, %o5
	andcc	%o5, 7, %o5		/* bytes till DST 8 byte aligned */
	brz,pt	%o5, .Ldst_aligned_on_8

	/* %o5 has the bytes to be written in partial store. */
	sub	%o2, %o5, %o2
	sub	%o1, %o0, %o1		/* %o1 gets the difference */
7:					/* dst aligning loop */
	ldub	[%o1+%o0], %o4		/* load one byte */
	subcc	%o5, 1, %o5
	stb	%o4, [%o0]
	bgu,pt	%XCC, 7b
	add	%o0, 1, %o0		/* advance dst */
	add	%o1, %o0, %o1		/* restore %o1 */
.Ldst_aligned_on_8:
	andcc	%o1, 7, %o5
	brnz,pt	%o5, .Lsrc_dst_unaligned_on_8
	nop

.Lsrc_dst_aligned_on_8:
	/* check if we are copying MED_MAX or more bytes */
	cmp	%o2, MED_MAX		/* limit to store buffer size */
	bgu,pn	%XCC, .Llarge_align8_copy
	nop
/*
 * Special case for handling when src and dest are both long word aligned
 * and total data to move is less than MED_MAX bytes
 */
.Lmedlong:
	subcc	%o2, 63, %o2		/* adjust length to allow cc test */
	ble,pn	%XCC, .Lmedl63		/* skip big loop if < 64 bytes */
	nop
.Lmedl64:
	ldx	[%o1], %o4		/* load */
	subcc	%o2, 64, %o2		/* decrement length count */
	stx	%o4, [%o0]		/* and store */
	ldx	[%o1+8], %o3		/* a block of 64 bytes */
	stx	%o3, [%o0+8]
	ldx	[%o1+16], %o4
	stx	%o4, [%o0+16]
	ldx	[%o1+24], %o3
	stx	%o3, [%o0+24]
	ldx	[%o1+32], %o4		/* load */
	stx	%o4, [%o0+32]		/* and store */
	ldx	[%o1+40], %o3		/* a block of 64 bytes */
	add	%o1, 64, %o1		/* increase src ptr by 64 */
	stx	%o3, [%o0+40]
	ldx	[%o1-16], %o4
	add	%o0, 64, %o0		/* increase dst ptr by 64 */
	stx	%o4, [%o0-16]
	ldx	[%o1-8], %o3
	bgu,pt	%XCC, .Lmedl64		/* repeat if at least 64 bytes left */
	stx	%o3, [%o0-8]
.Lmedl63:
	addcc	%o2, 32, %o2		/* adjust remaining count */
	ble,pt	%XCC, .Lmedl31		/* to skip if 31 or fewer bytes left */
	nop
	ldx	[%o1], %o4		/* load */
	sub	%o2, 32, %o2		/* decrement length count */
	stx	%o4, [%o0]		/* and store */
	ldx	[%o1+8], %o3		/* a block of 32 bytes */
	add	%o1, 32, %o1		/* increase src ptr by 32 */
	stx	%o3, [%o0+8]
	ldx	[%o1-16], %o4
	add	%o0, 32, %o0		/* increase dst ptr by 32 */
	stx	%o4, [%o0-16]
	ldx	[%o1-8], %o3
	stx	%o3, [%o0-8]
.Lmedl31:
	addcc	%o2, 16, %o2		/* adjust remaining count */
	ble,pt	%XCC, .Lmedl15		/* skip if 15 or fewer bytes left */
	nop
	ldx	[%o1], %o4		/* load and store 16 bytes */
	add	%o1, 16, %o1		/* increase src ptr by 16 */
	stx	%o4, [%o0]
	sub	%o2, 16, %o2		/* decrease count by 16 */
	ldx	[%o1-8], %o3
	add	%o0, 16, %o0		/* increase dst ptr by 16 */
	stx	%o3, [%o0-8]
.Lmedl15:
	addcc	%o2, 15, %o2		/* restore count */
	bz,pt	%XCC, .Lsmallexit	/* exit if finished */
	cmp	%o2, 8
	blt,pt	%XCC, .Lmedw7		/* skip if 7 or fewer bytes left */
	tst	%o2
	ldx	[%o1], %o4		/* load 8 bytes */
	add	%o1, 8, %o1		/* increase src ptr by 8 */
	add	%o0, 8, %o0		/* increase dst ptr by 8 */
	subcc	%o2, 8, %o2		/* decrease count by 8 */
	bnz,pn	%XCC, .Lmedw7
	stx	%o4, [%o0-8]		/* and store 8 bytes */
	retl
	mov	EX_RETVAL(%g1), %o0	/* restore %o0 */

	.align	16
.Lsrc_dst_unaligned_on_8:
	/* DST is 8-byte aligned, src is not */
	andcc	%o1, 0x3, %o5		/* test word alignment */
	bnz,pt	%XCC, .Lunalignsetup	/* branch if not word aligned */
	nop

/*
 * Handle all cases where src and dest are aligned on word
 * boundaries. Use unrolled loops for better performance.
 * This option wins over standard large data move when
 * source and destination is in cache for medium
 * to short data moves.
 */
	cmp	%o2, MED_WMAX		/* limit to store buffer size */
	bge,pt	%XCC, .Lunalignrejoin	/* otherwise rejoin main loop */
	nop

	subcc	%o2, 31, %o2		/* adjust length to allow cc test */
					/* for end of loop */
	ble,pt	%XCC, .Lmedw31		/* skip big loop if less than 32 */
.Lmedw32:
	ld	[%o1], %o4		/* move a block of 32 bytes */
	sllx	%o4, 32, %o5
	ld	[%o1+4], %o4
	or	%o4, %o5, %o5
	stx	%o5, [%o0]
	subcc	%o2, 32, %o2		/* decrement length count */
	ld	[%o1+8], %o4
	sllx	%o4, 32, %o5
	ld	[%o1+12], %o4
	or	%o4, %o5, %o5
	stx	%o5, [%o0+8]
	add	%o1, 32, %o1		/* increase src ptr by 32 */
	ld	[%o1-16], %o4
	sllx	%o4, 32, %o5
	ld	[%o1-12], %o4
	or	%o4, %o5, %o5
	stx	%o5, [%o0+16]
	add	%o0, 32, %o0		/* increase dst ptr by 32 */
	ld	[%o1-8], %o4
	sllx	%o4, 32, %o5
	ld	[%o1-4], %o4
	or	%o4, %o5, %o5
	bgu,pt	%XCC, .Lmedw32		/* repeat if at least 32 bytes left */
	stx	%o5, [%o0-8]
.Lmedw31:
	addcc	%o2, 31, %o2		/* restore count */
	bz,pt	%XCC, .Lsmallexit	/* exit if finished */
	cmp	%o2, 16
	blt,pt	%XCC, .Lmedw15
	nop
	ld	[%o1], %o4		/* move a block of 16 bytes */
	sllx	%o4, 32, %o5
	subcc	%o2, 16, %o2		/* decrement length count */
	ld	[%o1+4], %o4
	or	%o4, %o5, %o5
	stx	%o5, [%o0]
	add	%o1, 16, %o1		/* increase src ptr by 16 */
	ld	[%o1-8], %o4
	add	%o0, 16, %o0		/* increase dst ptr by 16 */
	sllx	%o4, 32, %o5
	ld	[%o1-4], %o4
	or	%o4, %o5, %o5
	stx	%o5, [%o0-8]
.Lmedw15:
	bz,pt	%XCC, .Lsmallexit	/* exit if finished */
	cmp	%o2, 8
	blt,pn	%XCC, .Lmedw7		/* skip if 7 or fewer bytes left */
	tst	%o2
	ld	[%o1], %o4		/* load 4 bytes */
	subcc	%o2, 8, %o2		/* decrease count by 8 */
	stw	%o4, [%o0]		/* and store 4 bytes */
	add	%o1, 8, %o1		/* increase src ptr by 8 */
	ld	[%o1-4], %o3		/* load 4 bytes */
	add	%o0, 8, %o0		/* increase dst ptr by 8 */
	stw	%o3, [%o0-4]		/* and store 4 bytes */
	bz,pt	%XCC, .Lsmallexit	/* exit if finished */
.Lmedw7:				/* count is ge 1, less than 8 */
	cmp	%o2, 4			/* check for 4 bytes left */
	blt,pn	%XCC, .Lsmallleft3	/* skip if 3 or fewer bytes left */
	nop
	ld	[%o1], %o4		/* load 4 bytes */
	add	%o1, 4, %o1		/* increase src ptr by 4 */
	add	%o0, 4, %o0		/* increase dst ptr by 4 */
	subcc	%o2, 4, %o2		/* decrease count by 4 */
	bnz,pt	%XCC, .Lsmallleft3
	stw	%o4, [%o0-4]		/* and store 4 bytes */
	retl
	mov	EX_RETVAL(%g1), %o0	/* restore %o0 */

	.align	16
.Llarge_align8_copy:			/* Src and dst 8 byte aligned */
	/* align dst to 64 byte boundary */
	andcc	%o0, 0x3f, %o3		/* check for dst 64 byte aligned */
	brz,pn	%o3, .Laligned_to_64
	andcc	%o0, 8, %o3		/* odd long words to move? */
	brz,pt	%o3, .Laligned_to_16
	nop
	ldx	[%o1], %o4
	sub	%o2, 8, %o2
	add	%o1, 8, %o1		/* increment src ptr */
	add	%o0, 8, %o0		/* increment dst ptr */
	stx	%o4, [%o0-8]
.Laligned_to_16:
	andcc	%o0, 16, %o3		/* pair of long words to move? */
	brz,pt	%o3, .Laligned_to_32
	nop
	ldx	[%o1], %o4
	sub	%o2, 16, %o2
	stx	%o4, [%o0]
	add	%o1, 16, %o1		/* increment src ptr */
	ldx	[%o1-8], %o4
	add	%o0, 16, %o0		/* increment dst ptr */
	stx	%o4, [%o0-8]
.Laligned_to_32:
	andcc	%o0, 32, %o3		/* four long words to move? */
	brz,pt	%o3, .Laligned_to_64
	nop
	ldx	[%o1], %o4
	sub	%o2, 32, %o2
	stx	%o4, [%o0]
	ldx	[%o1+8], %o4
	stx	%o4, [%o0+8]
	ldx	[%o1+16], %o4
	stx	%o4, [%o0+16]
	add	%o1, 32, %o1		/* increment src ptr */
	ldx	[%o1-8], %o4
	add	%o0, 32, %o0		/* increment dst ptr */
	stx	%o4, [%o0-8]
.Laligned_to_64:
/* Following test is included to avoid issues where existing executables
 * incorrectly call memcpy with overlapping src and dest instead of memmove
 *
 * if ( (src ge dst) and (dst+len > src)) go to overlap case
 * if ( (src lt dst) and (src+len > dst)) go to overlap case
 */
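/*
 * The same test as a rough C sketch (illustrative only; "overlap" means
 * fall back to the plain .Lmv_aligned_on_64 loop instead of BIS stores):
 *
 *	if (src >= dst ? dst + len > src : src + len > dst)
 *		goto overlap_case;		ranges intersect
 *	goto non_overlap_case;			disjoint, BIS is safe
 */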
	cmp	%o1, %o0
	bge,pt	%XCC, 1f
	nop
/* src+len > dst? */
	add	%o1, %o2, %o4
	cmp	%o4, %o0
	bgt,pt	%XCC, .Lmv_aligned_on_64
	nop
	ba	2f
	nop
1:
/* dst+len > src? */
	add	%o0, %o2, %o4
	cmp	%o4, %o1
	bgt,pt	%XCC, .Lmv_aligned_on_64
	nop
2:
/* handle non-overlapped copies
 *
 * Using block init store (BIS) instructions to avoid fetching cache
 * lines from memory. Use ST_CHUNK stores to first element of each cache
 * line (similar to prefetching) to avoid overfilling STQ or miss buffers.
 * Gives existing cache lines time to be moved out of L1/L2/L3 cache.
 */
	andn	%o2, 0x3f, %o5		/* %o5 is multiple of block size */
	and	%o2, 0x3f, %o2		/* residue bytes in %o2 */

/* We use ASI_STBIMRU_P for the first store to each cache line
 * followed by ASI_STBI_P (mark as LRU) for the last store. That
 * mixed approach reduces the chances the cache line is removed
 * before we finish setting it, while minimizing the effects on
 * other cached values during a large memcpy
 *
 * Intermediate stores can be normal since first BIS activates the
 * cache line in the L2 cache.
 *
 * ST_CHUNK batches up initial BIS operations for several cache lines
 * to allow multiple requests to not be blocked by overflowing the
 * store miss buffer. Then the matching stores for all those
 * BIS operations are executed.
 */
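/*
 * Rough shape of the ST_CHUNK batching (illustrative pseudo-C; the real
 * loops are .Lalign_loop_start and .Lalign_loop_rest below):
 *
 *	for (i = 0; i < ST_CHUNK; i++)		phase 1: one BIS store
 *	    line[i][0] = src_line[i][0];	claims line i, marked MRU
 *	for (i = 0; i < ST_CHUNK; i++) {	phase 2: fill each line
 *	    line[i][1..6] = src_line[i][1..6];	normal 8-byte stores
 *	    line[i][7]    = src_line[i][7];	final store marks line LRU
 *	}
 */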

.Lalign_loop:
	cmp	%o5, ST_CHUNK*64
	blu,pt	%XCC, .Lalign_short
	mov	ST_CHUNK, %o3
	sllx	%o3, 6, %g5		/* ST_CHUNK*64 */

.Lalign_loop_start:
	prefetch [%o1 + (ALIGN_PRE * BLOCK_SIZE)], 21
	subcc	%o3, 2, %o3
	ldx	[%o1], %o4
	add	%o1, 128, %o1
	EX_ST(STORE_ASI(%o4, %o0))
	add	%o0, 64, %o0
	ldx	[%o1-64], %o4
	EX_ST(STORE_ASI(%o4, %o0))
	add	%o0, 64, %o0
	bgu,pt	%XCC, .Lalign_loop_start
	prefetch [%o1 + ((ALIGN_PRE-1) * BLOCK_SIZE)], 21

	mov	ST_CHUNK, %o3
	sub	%o1, %g5, %o1		/* reset %o1 */
	sub	%o0, %g5, %o0		/* reset %o0 */

	sub	%o0, 8, %o0		/* adjust %o0 for ASI alignment */
.Lalign_loop_rest:
	ldx	[%o1+8], %o4
	add	%o0, 64, %o0
	stx	%o4, [%o0-48]
	subcc	%o3, 1, %o3
	ldx	[%o1+16], %o4
	stx	%o4, [%o0-40]
	sub	%o5, 64, %o5
	ldx	[%o1+24], %o4
	stx	%o4, [%o0-32]
	ldx	[%o1+32], %o4
	stx	%o4, [%o0-24]
	ldx	[%o1+40], %o4
	stx	%o4, [%o0-16]
	ldx	[%o1+48], %o4
	stx	%o4, [%o0-8]
	add	%o1, 64, %o1
	ldx	[%o1-8], %o4
	bgu,pt	%XCC, .Lalign_loop_rest
	EX_ST(STORE_INIT(%o4, %o0))	/* mark cache line as LRU */

	mov	ST_CHUNK, %o3
	cmp	%o5, ST_CHUNK*64
	bgu,pt	%XCC, .Lalign_loop_start
	add	%o0, 8, %o0		/* restore %o0 from ASI alignment */

	cmp	%o5, 0
	beq,pt	%XCC, .Lalign_done

/* no prefetches needed in these loops
 * since we are within ALIGN_PRE of the end */
.Lalign_short:
	srl	%o5, 6, %o3
.Lalign_loop_short:
	subcc	%o3, 1, %o3
	ldx	[%o1], %o4
	add	%o1, 64, %o1
	EX_ST(STORE_ASI(%o4, %o0))
	bgu,pt	%XCC, .Lalign_loop_short
	add	%o0, 64, %o0

	sub	%o1, %o5, %o1		/* reset %o1 */
	sub	%o0, %o5, %o0		/* reset %o0 */

	sub	%o0, 8, %o0		/* adjust %o0 for ASI alignment */
.Lalign_short_rest:
	ldx	[%o1+8], %o4
	add	%o0, 64, %o0
	stx	%o4, [%o0-48]
	ldx	[%o1+16], %o4
	subcc	%o5, 64, %o5
	stx	%o4, [%o0-40]
	ldx	[%o1+24], %o4
	stx	%o4, [%o0-32]
	ldx	[%o1+32], %o4
	stx	%o4, [%o0-24]
	ldx	[%o1+40], %o4
	stx	%o4, [%o0-16]
	ldx	[%o1+48], %o4
	stx	%o4, [%o0-8]
	add	%o1, 64, %o1
	ldx	[%o1-8], %o4
	bgu,pt	%XCC, .Lalign_short_rest
	EX_ST(STORE_INIT(%o4, %o0))	/* mark cache line as LRU */

	add	%o0, 8, %o0		/* restore %o0 from ASI alignment */

.Lalign_done:
	cmp	%o2, 0
	membar	#StoreStore
	bne,pt	%XCC, .Lmedl63
	subcc	%o2, 63, %o2		/* adjust length to allow cc test */
	retl
	mov	EX_RETVAL(%g1), %o0	/* restore %o0 */

	.align	16
/* Dst is on 8 byte boundary; src is not; remaining cnt > SMALL_MAX */
/* Since block load/store and BIS are not in use for unaligned data,
 * no need to align dst on 64 byte cache line boundary */
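/* The loop below uses the VIS alignaddr/faligndata pair instead of integer
 * shifts: alignaddr %o1, %g0, %g0 records the byte offset of the source in
 * %gsr, and each faligndata %fA, %fB, %fC extracts the 8 destination bytes
 * that straddle the two adjacent aligned doublewords %fA and %fB.  Carrying
 * the last doubleword of one iteration into the next (%f0) software-pipelines
 * the loads, moving 64 bytes per iteration. */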
.Lunalignsetup:
.Lunalignrejoin:
	rd	%fprs, %g5		/* check for unused fp */
	/* if fprs.fef == 0, set it.
	 * Setting it when already set costs more than checking */
	andcc	%g5, FPRS_FEF, %g5	/* test FEF, fprs.du = fprs.dl = 0 */
	bz,a	%XCC, 1f
	wr	%g0, FPRS_FEF, %fprs	/* fprs.fef = 1 */
1:
	andn	%o2, 0x3f, %o5		/* %o5 is multiple of block size */
	and	%o2, 0x3f, %o2		/* residue bytes in %o2 */
	cmp	%o2, 8			/* Insure we do not load beyond */
	bgt,pt	%XCC, .Lunalign_adjust	/* end of source buffer */
	andn	%o1, 0x7, %o4		/* %o4 has 8 byte aligned src addr */
	add	%o2, 64, %o2		/* adjust to leave loop */
	sub	%o5, 64, %o5		/* early if necessary */
.Lunalign_adjust:
	alignaddr %o1, %g0, %g0		/* generate %gsr */
	add	%o1, %o5, %o1		/* advance %o1 to after blocks */
	ldd	[%o4], %f0
.Lunalign_loop:
	prefetch [%o0 + (9 * BLOCK_SIZE)], 20
	ldd	[%o4+8], %f2
	faligndata %f0, %f2, %f16
	ldd	[%o4+16], %f4
	subcc	%o5, BLOCK_SIZE, %o5
	std	%f16, [%o0]
	faligndata %f2, %f4, %f18
	ldd	[%o4+24], %f6
	std	%f18, [%o0+8]
	faligndata %f4, %f6, %f20
	ldd	[%o4+32], %f8
	std	%f20, [%o0+16]
	faligndata %f6, %f8, %f22
	ldd	[%o4+40], %f10
	std	%f22, [%o0+24]
	faligndata %f8, %f10, %f24
	ldd	[%o4+48], %f12
	std	%f24, [%o0+32]
	faligndata %f10, %f12, %f26
	ldd	[%o4+56], %f14
	add	%o4, BLOCK_SIZE, %o4
	std	%f26, [%o0+40]
	faligndata %f12, %f14, %f28
	ldd	[%o4], %f0
	std	%f28, [%o0+48]
	faligndata %f14, %f0, %f30
	std	%f30, [%o0+56]
	add	%o0, BLOCK_SIZE, %o0
	bgu,pt	%XCC, .Lunalign_loop
	prefetch [%o4 + (11 * BLOCK_SIZE)], 20

/* Handle trailing bytes, 64 to 127
 * Dest long word aligned, Src not long word aligned */
	cmp	%o2, 15
	bleu,pt	%XCC, .Lunalign_short

	andn	%o2, 0x7, %o5		/* %o5 is multiple of 8 */
	and	%o2, 0x7, %o2		/* residue bytes in %o2 */
	add	%o2, 8, %o2
	sub	%o5, 8, %o5		/* do not load past end of src */
	andn	%o1, 0x7, %o4		/* %o4 has 8 byte aligned src addr */
	add	%o1, %o5, %o1		/* move %o1 to after multiple of 8 */
	ldd	[%o4], %f0		/* fetch partial word */
.Lunalign_by8:
	ldd	[%o4+8], %f2
	add	%o4, 8, %o4
	faligndata %f0, %f2, %f16
	subcc	%o5, 8, %o5
	std	%f16, [%o0]
	fsrc2	%f2, %f0
	bgu,pt	%XCC, .Lunalign_by8
	add	%o0, 8, %o0

.Lunalign_short:			/* restore fprs state */
	brnz,pt	%g5, .Lsmallrest
	nop
	ba	.Lsmallrest
	wr	%g5, %g0, %fprs
END(__memcpy_niagara7)

#endif