/* Optimized memcpy implementation for PowerPC A2.
   Copyright (C) 2010-2015 Free Software Foundation, Inc.
   Contributed by Michael Brutman <brutman@us.ibm.com>.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>

#define PREFETCH_AHEAD 4        /* number of cache lines to prefetch ahead of SRC reads  */
#define ZERO_AHEAD 2            /* number of cache lines to zero ahead of DST writes  */

        .machine  a2
EALIGN (memcpy, 5, 0)
        CALL_MCOUNT

        dcbt    0,r4            /* Prefetch ONE SRC cacheline  */
        cmplwi  cr1,r5,16       /* is size < 16 ?  */
        mr      r6,r3           /* Copy dest reg to r6;  */
        blt+    cr1,L(shortcopy)


/* Big copy (16 bytes or more)

   Figure out how far to the nearest quadword boundary, or if we are
   on one already.

   r3 - return value (always)
   r4 - current source addr
   r5 - copy length
   r6 - current dest addr  */
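
/* Illustrative C for the set-up below (a sketch, not assembled code;
   pad and off are just names for what ends up in r8 and r7): the low
   4 bits of the negated dest address give the distance to the next
   16-byte boundary, and keeping src as an offset from dest frees a
   register for the copy sequence.

     size_t pad = (-(uintptr_t) dst) & 15;   -- bytes to 16-byte bdy
     ptrdiff_t off = src - dst;              -- so src == dst + off  */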

        neg     r8,r3           /* LS 4 bits = # bytes to 16-byte dest bdry  */
        clrlwi  r8,r8,32-4      /* keep only those 4 bits  */
        sub     r7,r4,r3        /* compute offset to src from dest  */
        cmplwi  cr0,r8,0        /* Were we aligned on a 16 byte bdy?  */
        beq+    L(dst_aligned)


/* Destination is not aligned on quadword boundary.  Get us to one.

   r3 - return value (always)
   r4 - current source addr
   r5 - copy length
   r6 - current dest addr
   r7 - offset to src from dest
   r8 - number of bytes to quadword boundary  */
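
/* Sketch of the staged copy below in C-like form (illustrative only;
   d, n mirror r6, r5 and off mirrors r7): each set bit of the 0-15
   byte pad is retired with one naturally sized access, and the 8-byte
   case goes through an FP register because GPRs are only 32 bits wide
   on this target.

     n -= pad;
     if (pad & 1) { *(uint8_t  *) d = *(uint8_t  *) (d + off); d += 1; }
     if (pad & 2) { *(uint16_t *) d = *(uint16_t *) (d + off); d += 2; }
     if (pad & 4) { *(uint32_t *) d = *(uint32_t *) (d + off); d += 4; }
     if (pad & 8) { *(double   *) d = *(double   *) (d + off); d += 8; }
     s = d + off;  */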

        mtcrf   0x01,r8         /* put #bytes to boundary into cr7  */
        subf    r5,r8,r5        /* adjust remaining len  */

        bf      cr7*4+3,1f
        lbzx    r0,r7,r6        /* copy 1 byte  */
        stb     r0,0(r6)
        addi    r6,r6,1
1:
        bf      cr7*4+2,2f
        lhzx    r0,r7,r6        /* copy 2 bytes  */
        sth     r0,0(r6)
        addi    r6,r6,2
2:
        bf      cr7*4+1,4f
        lwzx    r0,r7,r6        /* copy 4 bytes  */
        stw     r0,0(r6)
        addi    r6,r6,4
4:
        bf      cr7*4+0,8f
        lfdx    fp0,r7,r6       /* copy 8 bytes via an FP register  */
        stfd    fp0,0(r6)
        addi    r6,r6,8
8:
        add     r4,r7,r6        /* update src addr  */


/* Dest is quadword aligned now.

   Lots of decisions to make.  If we are copying less than a cache
   line we won't be here long.  If we are not on a cache line
   boundary we need to get there.  And then we need to figure out
   how many cache lines ahead to pre-touch.

   r3 - return value (always)
   r4 - current source addr
   r5 - copy length
   r6 - current dest addr  */

        .align 4
L(dst_aligned):

#ifdef SHARED
        mflr    r0
/* Establishes GOT addressability so we can load __cache_line_size
   from static.  This value was set from the aux vector during startup.  */
        SETUP_GOT_ACCESS(r9,got_label)
        addis   r9,r9,__cache_line_size-got_label@ha
        lwz     r9,__cache_line_size-got_label@l(r9)
        mtlr    r0
#else
/* Load __cache_line_size from static.  This value was set from the
   aux vector during startup.  */
        lis     r9,__cache_line_size@ha
        lwz     r9,__cache_line_size@l(r9)
#endif
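
/* Either way, r9 now holds the runtime data-cache line size, or 0 if
   the aux vector did not report one.  (On ppc32, PIC code has no fixed
   pointer to the GOT, so SETUP_GOT_ACCESS computes it at run time; the
   mflr/mtlr pair around it preserves LR across that computation.)  */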

        cmplwi  cr5,r9,0
        bne+    cr5,L(cachelineset)

/* __cache_line_size not set: generic byte copy without much optimization  */
        andi.   r0,r5,1         /* If length is odd copy one byte.  */
        beq     L(cachelinenotset_align)
        lbz     r7,0(r4)        /* Read one byte from source.  */
        addi    r5,r5,-1        /* Update length.  */
        addi    r4,r4,1         /* Update source pointer address.  */
        stb     r7,0(r6)        /* Store one byte on dest.  */
        addi    r6,r6,1         /* Update dest pointer address.  */
L(cachelinenotset_align):
        cmpwi   cr7,r5,0        /* If length is 0 return.  */
        beqlr   cr7
        ori     r2,r2,0         /* Force a new dispatch group.  */
L(cachelinenotset_loop):
        addic.  r5,r5,-2        /* Update length.  */
        lbz     r7,0(r4)        /* Load 2 bytes from source.  */
        lbz     r8,1(r4)
        addi    r4,r4,2         /* Update source pointer address.  */
        stb     r7,0(r6)        /* Store 2 bytes on dest.  */
        stb     r8,1(r6)
        addi    r6,r6,2         /* Update dest pointer address.  */
        bne     L(cachelinenotset_loop)
        blr

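/* The fallback above, as illustrative C (a sketch; d, s, n mirror
   r6, r4, r5, and the real code also forces a dispatch-group break
   before the loop):

     if (n & 1) { *d++ = *s++; n--; }
     if (n == 0) return dst;
     do { d[0] = s[0]; d[1] = s[1]; d += 2; s += 2; n -= 2; } while (n != 0);  */
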
L(cachelineset):

        addi    r10,r9,-1

        cmpw    cr5,r5,r10      /* Less than a cacheline to go?  */

        neg     r7,r6           /* How far to next cacheline bdy?  */

        addi    r6,r6,-8        /* prepare for stfdu  */
        cmpwi   cr0,r9,128
        addi    r4,r4,-8        /* prepare for lfdu  */

        ble+    cr5,L(lessthancacheline)

        beq-    cr0,L(big_lines)        /* 128 byte line code  */


/* More than a cacheline left to go, and using 64 byte cachelines  */

        clrlwi  r7,r7,32-6      /* How far to next cacheline bdy?  */

        cmplwi  cr6,r7,0        /* Are we on a cacheline bdy already?  */

        /* Reduce total len by what it takes to get to the next cache line  */
        subf    r5,r7,r5
        srwi    r7,r7,4         /* How many qws to get to the line bdy?  */

        /* How many full cache lines to copy after getting to a line bdy?  */
        srwi    r10,r5,6

        cmplwi  r10,0           /* If no full cache lines to copy ...  */
        li      r11,0           /* number cachelines to copy with prefetch  */
        beq     L(nocacheprefetch)

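/* The boundary math above, as illustrative C (dest is already 16-byte
   aligned here, so the distance to the line boundary is a whole number
   of quadwords; variable names are just for the sketch):

     size_t to_line = (-(uintptr_t) d) & 63;  -- bytes to next 64B line
     n -= to_line;
     size_t qwords = to_line >> 4;            -- 16B chunks to the line
     size_t lines  = n >> 6;                  -- whole 64B lines after it  */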

/* We are here because we have at least one full cache line to copy,
   and therefore some pre-touching to do.  */

        cmplwi  r10,PREFETCH_AHEAD
        li      r12,64+8        /* prefetch distance  */
        ble     L(lessthanmaxprefetch)

/* We can only do so much pre-fetching.  R11 will have the count of
   lines left to prefetch after the initial batch of prefetches
   are executed.  */

        subi    r11,r10,PREFETCH_AHEAD
        li      r10,PREFETCH_AHEAD

L(lessthanmaxprefetch):
        mtctr   r10

/* At this point r10/ctr hold the number of lines to prefetch in this
   initial batch, and r11 holds any remainder.  */

L(prefetchSRC):
        dcbt    r12,r4
        addi    r12,r12,64
        bdnz    L(prefetchSRC)

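/* Prefetch clamping in C-like form (a sketch): at most PREFETCH_AHEAD
   lines are touched up front; the rest ("later", kept in r11) get one
   dcbt per iteration of the main copy loop.  The +8 in the prefetch
   distance cancels the -8 bias already applied to r4, so the first
   touch lands one full line ahead of the current source.

     size_t later = 0, now = lines;
     if (lines > PREFETCH_AHEAD)
       { later = lines - PREFETCH_AHEAD; now = PREFETCH_AHEAD; }
     for (size_t i = 0; i < now; i++)
       __builtin_prefetch (s + (i + 1) * 64);   -- analogue of dcbt r12,r4  */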

/* Prefetching is done, or was not needed.

   cr6 - are we on a cacheline boundary already?
   r7 - number of quadwords to the next cacheline boundary  */

L(nocacheprefetch):
        mtctr   r7

        cmplwi  cr1,r5,64       /* Less than a cache line to copy?  */

        /* How many bytes are left after we copy whatever full
           cache lines we can get?  */
        clrlwi  r5,r5,32-6

        beq     cr6,L(cachelinealigned)

/* Copy quadwords up to the next cacheline boundary  */

L(aligntocacheline):
        lfd     fp9,0x08(r4)
        lfdu    fp10,0x10(r4)
        stfd    fp9,0x08(r6)
        stfdu   fp10,0x10(r6)
        bdnz    L(aligntocacheline)

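/* Note: lfd/stfd move 8 bytes at a time through the FP registers, the
   widest single-instruction load/store on 32-bit PowerPC; the update
   forms (lfdu/stfdu) bump r4/r6 as they go, which is why both pointers
   were pre-biased by -8.  */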

        .align 4
L(cachelinealigned):            /* copy whole cache lines  */

        blt-    cr1,L(lessthancacheline)        /* size < 64  */

L(outerloop):
        cmpwi   r11,0
        mtctr   r11
        beq-    L(endloop)

        li      r11,64*ZERO_AHEAD+8     /* DCBZ dist  */

        .align 4
/* Copy whole cachelines, optimized by prefetching SRC cacheline  */
L(loop):                        /* Copy aligned body  */
        dcbt    r12,r4          /* PREFETCH SOURCE some cache lines ahead  */
        lfd     fp9,0x08(r4)
        dcbz    r11,r6
        lfd     fp10,0x10(r4)
        lfd     fp11,0x18(r4)
        lfd     fp12,0x20(r4)
        stfd    fp9,0x08(r6)
        stfd    fp10,0x10(r6)
        stfd    fp11,0x18(r6)
        stfd    fp12,0x20(r6)
        lfd     fp9,0x28(r4)
        lfd     fp10,0x30(r4)
        lfd     fp11,0x38(r4)
        lfdu    fp12,0x40(r4)
        stfd    fp9,0x28(r6)
        stfd    fp10,0x30(r6)
        stfd    fp11,0x38(r6)
        stfdu   fp12,0x40(r6)

        bdnz    L(loop)

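/* Two notes on the loop above: dcbt keeps pulling the source stream in
   PREFETCH_AHEAD lines early, while dcbz establishes each destination
   line ZERO_AHEAD lines early as zeroed in the cache, so the stores
   never need to fetch the old contents from memory.  (dcbz is only
   safe here because memcpy may assume normal, cacheable memory.)  */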

L(endloop):
        cmpwi   r10,0
        beq-    L(endloop2)
        mtctr   r10

L(loop2):                       /* Copy aligned body  */
        lfd     fp9,0x08(r4)
        lfd     fp10,0x10(r4)
        lfd     fp11,0x18(r4)
        lfd     fp12,0x20(r4)
        stfd    fp9,0x08(r6)
        stfd    fp10,0x10(r6)
        stfd    fp11,0x18(r6)
        stfd    fp12,0x20(r6)
        lfd     fp9,0x28(r4)
        lfd     fp10,0x30(r4)
        lfd     fp11,0x38(r4)
        lfdu    fp12,0x40(r4)
        stfd    fp9,0x28(r6)
        stfd    fp10,0x30(r6)
        stfd    fp11,0x38(r6)
        stfdu   fp12,0x40(r6)

        bdnz    L(loop2)
L(endloop2):

        .align 4
L(lessthancacheline):           /* Was there less than a cache line to do?  */
        cmplwi  cr0,r5,16
        srwi    r7,r5,4         /* divide size by 16  */
        blt-    L(do_lt16)
        mtctr   r7

L(copy_remaining):
        lfd     fp9,0x08(r4)
        lfdu    fp10,0x10(r4)
        stfd    fp9,0x08(r6)
        stfdu   fp10,0x10(r6)
        bdnz    L(copy_remaining)

L(do_lt16):                     /* less than 16 ?  */
        cmplwi  cr0,r5,0        /* copy remaining bytes (0-15)  */
        beqlr+                  /* no rest to copy  */
        addi    r4,r4,8         /* undo the -8 bias before the tail copy  */
        addi    r6,r6,8

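/* Illustrative C for the 0-15 byte tail handled below: one test per
   size bit, largest first, again using an FP register for the 8-byte
   case (a sketch; only the low 4 bits of n matter here):

     ptrdiff_t off = s - d;
     if (n & 8) { *(double   *) d = *(double   *) (d + off); d += 8; }
     if (n & 4) { *(uint32_t *) d = *(uint32_t *) (d + off); d += 4; }
     if (n & 2) { *(uint16_t *) d = *(uint16_t *) (d + off); d += 2; }
     if (n & 1) { *(uint8_t  *) d = *(uint8_t  *) (d + off); }  */
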
L(shortcopy):                   /* SIMPLE COPY to handle size <= 15 bytes  */
        mtcrf   0x01,r5
        sub     r7,r4,r6
        bf-     cr7*4+0,8f
        lfdx    fp9,r7,r6       /* copy 8 bytes  */
        stfd    fp9,0(r6)
        addi    r6,r6,8
8:
        bf      cr7*4+1,4f
        lwzx    r0,r7,r6        /* copy 4 bytes  */
        stw     r0,0(r6)
        addi    r6,r6,4
4:
        bf      cr7*4+2,2f
        lhzx    r0,r7,r6        /* copy 2 bytes  */
        sth     r0,0(r6)
        addi    r6,r6,2
2:
        bf      cr7*4+3,1f
        lbzx    r0,r7,r6        /* copy 1 byte  */
        stb     r0,0(r6)
1:
        blr



/* Similar to above, but for use with 128 byte lines.  */

L(big_lines):

        clrlwi  r7,r7,32-7      /* How far to next cacheline bdy?  */

        cmplwi  cr6,r7,0        /* Are we on a cacheline bdy already?  */

        /* Reduce total len by what it takes to get to the next cache line  */
        subf    r5,r7,r5
        srwi    r7,r7,4         /* How many qws to get to the line bdy?  */

        /* How many full cache lines to copy after getting to a line bdy?  */
        srwi    r10,r5,7

        cmplwi  r10,0           /* If no full cache lines to copy ...  */
        li      r11,0           /* number cachelines to copy with prefetch  */
        beq     L(nocacheprefetch_128)


/* We are here because we have at least one full cache line to copy,
   and therefore some pre-touching to do.  */

        cmplwi  r10,PREFETCH_AHEAD
        li      r12,128+8       /* prefetch distance  */
        ble     L(lessthanmaxprefetch_128)

/* We can only do so much pre-fetching.  R11 will have the count of
   lines left to prefetch after the initial batch of prefetches
   are executed.  */

        subi    r11,r10,PREFETCH_AHEAD
        li      r10,PREFETCH_AHEAD

L(lessthanmaxprefetch_128):
        mtctr   r10

/* At this point r10/ctr hold the number of lines to prefetch in this
   initial batch, and r11 holds any remainder.  */

L(prefetchSRC_128):
        dcbt    r12,r4
        addi    r12,r12,128
        bdnz    L(prefetchSRC_128)

/* Prefetching is done, or was not needed.

   cr6 - are we on a cacheline boundary already?
   r7 - number of quadwords to the next cacheline boundary  */

L(nocacheprefetch_128):
        mtctr   r7

        cmplwi  cr1,r5,128      /* Less than a cache line to copy?  */

        /* How many bytes are left after we copy whatever full
           cache lines we can get?  */
        clrlwi  r5,r5,32-7

        beq     cr6,L(cachelinealigned_128)

/* Copy quadwords up to the next cacheline boundary  */

L(aligntocacheline_128):
        lfd     fp9,0x08(r4)
        lfdu    fp10,0x10(r4)
        stfd    fp9,0x08(r6)
        stfdu   fp10,0x10(r6)
        bdnz    L(aligntocacheline_128)

L(cachelinealigned_128):        /* copy whole cache lines  */

        blt-    cr1,L(lessthancacheline)        /* size < 128  */

L(outerloop_128):
        cmpwi   r11,0
        mtctr   r11
        beq-    L(endloop_128)

        li      r11,128*ZERO_AHEAD+8    /* DCBZ dist  */

        .align 4
/* Copy whole cachelines, optimized by prefetching SRC cacheline  */
L(loop_128):                    /* Copy aligned body  */
        dcbt    r12,r4          /* PREFETCH SOURCE some cache lines ahead  */
        lfd     fp9,0x08(r4)
        dcbz    r11,r6
        lfd     fp10,0x10(r4)
        lfd     fp11,0x18(r4)
        lfd     fp12,0x20(r4)
        stfd    fp9,0x08(r6)
        stfd    fp10,0x10(r6)
        stfd    fp11,0x18(r6)
        stfd    fp12,0x20(r6)
        lfd     fp9,0x28(r4)
        lfd     fp10,0x30(r4)
        lfd     fp11,0x38(r4)
        lfd     fp12,0x40(r4)
        stfd    fp9,0x28(r6)
        stfd    fp10,0x30(r6)
        stfd    fp11,0x38(r6)
        stfd    fp12,0x40(r6)
        lfd     fp9,0x48(r4)
        lfd     fp10,0x50(r4)
        lfd     fp11,0x58(r4)
        lfd     fp12,0x60(r4)
        stfd    fp9,0x48(r6)
        stfd    fp10,0x50(r6)
        stfd    fp11,0x58(r6)
        stfd    fp12,0x60(r6)
        lfd     fp9,0x68(r4)
        lfd     fp10,0x70(r4)
        lfd     fp11,0x78(r4)
        lfdu    fp12,0x80(r4)
        stfd    fp9,0x68(r6)
        stfd    fp10,0x70(r6)
        stfd    fp11,0x78(r6)
        stfdu   fp12,0x80(r6)

        bdnz    L(loop_128)

L(endloop_128):
        cmpwi   r10,0
        beq-    L(endloop2_128)
        mtctr   r10

L(loop2_128):                   /* Copy aligned body  */
        lfd     fp9,0x08(r4)
        lfd     fp10,0x10(r4)
        lfd     fp11,0x18(r4)
        lfd     fp12,0x20(r4)
        stfd    fp9,0x08(r6)
        stfd    fp10,0x10(r6)
        stfd    fp11,0x18(r6)
        stfd    fp12,0x20(r6)
        lfd     fp9,0x28(r4)
        lfd     fp10,0x30(r4)
        lfd     fp11,0x38(r4)
        lfd     fp12,0x40(r4)
        stfd    fp9,0x28(r6)
        stfd    fp10,0x30(r6)
        stfd    fp11,0x38(r6)
        stfd    fp12,0x40(r6)
        lfd     fp9,0x48(r4)
        lfd     fp10,0x50(r4)
        lfd     fp11,0x58(r4)
        lfd     fp12,0x60(r4)
        stfd    fp9,0x48(r6)
        stfd    fp10,0x50(r6)
        stfd    fp11,0x58(r6)
        stfd    fp12,0x60(r6)
        lfd     fp9,0x68(r4)
        lfd     fp10,0x70(r4)
        lfd     fp11,0x78(r4)
        lfdu    fp12,0x80(r4)
        stfd    fp9,0x68(r6)
        stfd    fp10,0x70(r6)
        stfd    fp11,0x78(r6)
        stfdu   fp12,0x80(r6)
        bdnz    L(loop2_128)
L(endloop2_128):

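/* Residual (sub-line) bytes are finished by the tail code shared with
   the 64-byte path above; its 16-byte threshold works the same for
   either line size.  */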
        b       L(lessthancacheline)

END (memcpy)
libc_hidden_builtin_def (memcpy)