/* Optimized memcpy implementation for PowerPC A2.
   Copyright (C) 2010-2013 Free Software Foundation, Inc.
   Contributed by Michael Brutman <brutman@us.ibm.com>.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include <bp-sym.h>
#include <bp-asm.h>

#define PREFETCH_AHEAD  4       /* number of cache lines to prefetch ahead of SRC */
#define ZERO_AHEAD      2       /* number of cache lines to zero ahead of DST */

        .section ".toc","aw"
.LC0:
        .tc __cache_line_size[TC],__cache_line_size
        .section ".text"
        .align 2


        .machine a2
EALIGN (BP_SYM (memcpy), 5, 0)
        CALL_MCOUNT 3

        dcbt    0,r4            /* Prefetch ONE SRC cacheline */
        cmpldi  cr1,r5,16       /* is size < 16 ? */
        mr      r6,r3           /* Copy dest reg to r6 */
        blt+    cr1,L(shortcopy)


        /* Big copy (16 bytes or more)

           Figure out how far to the nearest quadword boundary, or if we are
           on one already.  Also get the cache line size.

           r3 - return value (always)
           r4 - current source addr
           r5 - copy length
           r6 - current dest addr
        */
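
        /* Illustrative note (not in the original source): the code below
           computes the distance to the next 16-byte boundary as
           r8 = (-dest) & 15, i.e. in C:

               size_t to_boundary = (-(uintptr_t) dest) & 15;

           When dest is already aligned this is zero and we branch straight
           to L(dst_aligned).  */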

        neg     r8,r3           /* LS 4 bits = # bytes to 16-byte dest bdry */
        ld      r9,.LC0@toc(r2) /* Get cache line size (part 1) */
        clrldi  r8,r8,64-4      /* align to 16-byte boundary */
        sub     r7,r4,r3        /* compute offset to src from dest */
        lwz     r9,0(r9)        /* Get cache line size (part 2) */
        cmpldi  cr0,r8,0        /* Were we aligned on a 16 byte bdy? */
        addi    r10,r9,-1       /* Cache line mask */
        beq+    L(dst_aligned)



        /* Destination is not aligned on quadword boundary.  Get us to one.

           r3 - return value (always)
           r4 - current source addr
           r5 - copy length
           r6 - current dest addr
           r7 - offset to src from dest
           r8 - number of bytes to quadword boundary
        */
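
        /* Illustrative note (not in the original source): mtcrf 0x01,r8
           moves the low four bits of r8 into cr7, so each "bf cr7*4+n"
           below tests one bit of the byte count.  Roughly, in C:

               if (r8 & 1) copy 1 byte;
               if (r8 & 2) copy 2 bytes;
               if (r8 & 4) copy 4 bytes;
               if (r8 & 8) copy 8 bytes;

           Loads are indexed as r7+r6 (r7 holds the src-dest offset), so
           only the dest pointer needs to be advanced.  */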

        mtcrf   0x01,r8         /* put #bytes to boundary into cr7 */
        subf    r5,r8,r5        /* adjust remaining len */

        bf      cr7*4+3,1f
        lbzx    r0,r7,r6        /* copy 1 byte addr */
        stb     r0,0(r6)
        addi    r6,r6,1
1:
        bf      cr7*4+2,2f
        lhzx    r0,r7,r6        /* copy 2 byte addr */
        sth     r0,0(r6)
        addi    r6,r6,2
2:
        bf      cr7*4+1,4f
        lwzx    r0,r7,r6        /* copy 4 byte addr */
        stw     r0,0(r6)
        addi    r6,r6,4
4:
        bf      cr7*4+0,8f
        ldx     r0,r7,r6        /* copy 8 byte addr */
        std     r0,0(r6)
        addi    r6,r6,8
8:
        add     r4,r7,r6        /* update src addr */



        /* Dest is quadword aligned now.

           Lots of decisions to make.  If we are copying less than a cache
           line we won't be here long.  If we are not on a cache line
           boundary we need to get there.  And then we need to figure out
           how many cache lines ahead to pre-touch.

           r3 - return value (always)
           r4 - current source addr
           r5 - copy length
           r6 - current dest addr
        */


        .align 4
L(dst_aligned):
        cmpdi   cr0,r9,0        /* Cache line size set? */
        bne+    cr0,L(cachelineset)

/* __cache_line_size not set: generic byte copy without much optimization */
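
        /* Illustrative note (not in the original source): this fallback is
           roughly the following C loop:

               if (len & 1) { *dst++ = *src++; len--; }
               if (len == 0) return;
               do { dst[0] = src[0]; dst[1] = src[1];
                    dst += 2; src += 2; len -= 2; } while (len);

           The "ori r2,r2,0" no-op below forces a new dispatch group before
           entering the loop.  */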
        clrldi. r0,r5,63        /* If length is odd copy one byte */
        beq     L(cachelinenotset_align)
        lbz     r7,0(r4)        /* Read one byte from source */
        addi    r5,r5,-1        /* Update length */
        addi    r4,r4,1         /* Update source pointer address */
        stb     r7,0(r6)        /* Store one byte at dest */
        addi    r6,r6,1         /* Update dest pointer address */
L(cachelinenotset_align):
        cmpdi   cr7,r5,0        /* If length is 0 return */
        beqlr   cr7
        ori     r2,r2,0         /* Force a new dispatch group */
L(cachelinenotset_loop):
        addic.  r5,r5,-2        /* Update length */
        lbz     r7,0(r4)        /* Load 2 bytes from source */
        lbz     r8,1(r4)
        addi    r4,r4,2         /* Update source pointer address */
        stb     r7,0(r6)        /* Store 2 bytes on dest */
        stb     r8,1(r6)
        addi    r6,r6,2         /* Update dest pointer address */
        bne     L(cachelinenotset_loop)
        blr


L(cachelineset):
        cmpd    cr5,r5,r10      /* Less than a cacheline to go? */

        neg     r7,r6           /* How far to next cacheline bdy? */

        addi    r6,r6,-8        /* prepare for stdu */
        cmpdi   cr0,r9,128
        addi    r4,r4,-8        /* prepare for ldu */


        ble+    cr5,L(lessthancacheline)

        beq-    cr0,L(big_lines) /* 128 byte line code */



        /* More than a cacheline left to go, and using 64 byte cachelines */

        clrldi  r7,r7,64-6      /* How far to next cacheline bdy? */

        cmpldi  cr6,r7,0        /* Are we on a cacheline bdy already? */

        /* Reduce total len by what it takes to get to the next cache line */
        subf    r5,r7,r5
        srdi    r7,r7,4         /* How many qws to get to the line bdy? */

        /* How many full cache lines to copy after getting to a line bdy? */
        srdi    r10,r5,6

        cmpldi  r10,0           /* If no full cache lines to copy ... */
        li      r11,0           /* number cachelines to copy with prefetch */
        beq     L(nocacheprefetch)


        /* We are here because we have at least one full cache line to copy,
           and therefore some pre-touching to do. */

        cmpldi  r10,PREFETCH_AHEAD
        li      r12,64+8        /* prefetch distance */
        ble     L(lessthanmaxprefetch)

        /* We can only do so much pre-fetching.  R11 will have the count of
           lines left to prefetch after the initial batch of prefetches
           are executed.  */

        subi    r11,r10,PREFETCH_AHEAD
        li      r10,PREFETCH_AHEAD
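
        /* Illustrative note (not in the original source): the initial
           prefetch batch is capped at PREFETCH_AHEAD lines.  Roughly:

               lines_now = min (full_lines, PREFETCH_AHEAD);
               remainder = full_lines > PREFETCH_AHEAD
                           ? full_lines - PREFETCH_AHEAD : 0;

           r10 (copied to ctr) gets lines_now, r11 gets the remainder.  */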

L(lessthanmaxprefetch):
        mtctr   r10

        /* At this point r10/ctr hold the number of lines to prefetch in this
           initial batch, and r11 holds any remainder. */

L(prefetchSRC):
        dcbt    r12,r4
        addi    r12,r12,64
        bdnz    L(prefetchSRC)


        /* Prefetching is done, or was not needed.

           cr6 - are we on a cacheline boundary already?
           r7  - number of quadwords to the next cacheline boundary
        */

L(nocacheprefetch):
        mtctr   r7

        cmpldi  cr1,r5,64       /* Less than a cache line to copy? */

        /* How many bytes are left after we copy whatever full
           cache lines we can get? */
        clrldi  r5,r5,64-6

        beq     cr6,L(cachelinealigned)


        /* Copy quadwords up to the next cacheline boundary */

L(aligntocacheline):
        ld      r9,0x08(r4)
        ld      r7,0x10(r4)
        addi    r4,r4,0x10
        std     r9,0x08(r6)
        stdu    r7,0x10(r6)
        bdnz    L(aligntocacheline)


        .align 4
L(cachelinealigned):            /* copy whole cache lines */

        blt-    cr1,L(lessthancacheline) /* size <64 */

L(outerloop):
        cmpdi   r11,0
        mtctr   r11
        beq-    L(endloop)

        li      r11,64*ZERO_AHEAD+8 /* DCBZ dist */

        .align 4
        /* Copy whole cachelines, optimized by prefetching SRC cacheline */
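        /* Illustrative note (not in the original source): each iteration
           moves one 64-byte line with eight ld/std pairs.  "dcbt r12,r4"
           touches the source PREFETCH_AHEAD lines ahead, and "dcbz r11,r6"
           zeroes the destination line ZERO_AHEAD lines ahead, establishing
           it in the cache without reading memory that is about to be
           overwritten.  The 128-byte-line loop further down mirrors this
           with sixteen ld/std pairs.  */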
L(loop):                        /* Copy aligned body */
        dcbt    r12,r4          /* PREFETCH SOURCE some cache lines ahead */
        ld      r9, 0x08(r4)
        dcbz    r11,r6
        ld      r7, 0x10(r4)
        ld      r8, 0x18(r4)
        ld      r0, 0x20(r4)
        std     r9, 0x08(r6)
        std     r7, 0x10(r6)
        std     r8, 0x18(r6)
        std     r0, 0x20(r6)
        ld      r9, 0x28(r4)
        ld      r7, 0x30(r4)
        ld      r8, 0x38(r4)
        ld      r0, 0x40(r4)
        addi    r4, r4,0x40
        std     r9, 0x28(r6)
        std     r7, 0x30(r6)
        std     r8, 0x38(r6)
        stdu    r0, 0x40(r6)

        bdnz    L(loop)


L(endloop):
        cmpdi   r10,0
        beq-    L(endloop2)
        mtctr   r10

L(loop2):                       /* Copy aligned body */
        ld      r9, 0x08(r4)
        ld      r7, 0x10(r4)
        ld      r8, 0x18(r4)
        ld      r0, 0x20(r4)
        std     r9, 0x08(r6)
        std     r7, 0x10(r6)
        std     r8, 0x18(r6)
        std     r0, 0x20(r6)
        ld      r9, 0x28(r4)
        ld      r7, 0x30(r4)
        ld      r8, 0x38(r4)
        ld      r0, 0x40(r4)
        addi    r4, r4,0x40
        std     r9, 0x28(r6)
        std     r7, 0x30(r6)
        std     r8, 0x38(r6)
        stdu    r0, 0x40(r6)

        bdnz    L(loop2)
L(endloop2):


        .align 4
L(lessthancacheline):           /* Less than a cache line left to do? */
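        /* Illustrative note (not in the original source): fewer than a
           cache line's worth of bytes remains here.  The loop below moves
           16 bytes per iteration; L(do_lt16) and L(shortcopy) then mop up
           the final 0-15 bytes.  */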
        cmpldi  cr0,r5,16
        srdi    r7,r5,4         /* divide size by 16 */
        blt-    L(do_lt16)
        mtctr   r7

L(copy_remaining):
        ld      r8,0x08(r4)
        ld      r7,0x10(r4)
        addi    r4,r4,0x10
        std     r8,0x08(r6)
        stdu    r7,0x10(r6)
        bdnz    L(copy_remaining)

L(do_lt16):                     /* less than 16 ? */
        cmpldi  cr0,r5,0        /* copy remaining bytes (0-15) */
        beqlr+                  /* no rest to copy */
        addi    r4,r4,8
        addi    r6,r6,8

L(shortcopy):                   /* SIMPLE COPY to handle size <= 15 bytes */
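        /* Illustrative note (not in the original source): the low four
           bits of the length go into cr7, mirroring the alignment prologue
           but in the opposite order.  In C:

               if (len & 8) copy 8 bytes;
               if (len & 4) copy 4 bytes;
               if (len & 2) copy 2 bytes;
               if (len & 1) copy 1 byte;
        */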
        mtcrf   0x01,r5
        sub     r7,r4,r6
        bf-     cr7*4+0,8f
        ldx     r0,r7,r6        /* copy 8 byte */
        std     r0,0(r6)
        addi    r6,r6,8
8:
        bf      cr7*4+1,4f
        lwzx    r0,r7,r6        /* copy 4 byte */
        stw     r0,0(r6)
        addi    r6,r6,4
4:
        bf      cr7*4+2,2f
        lhzx    r0,r7,r6        /* copy 2 byte */
        sth     r0,0(r6)
        addi    r6,r6,2
2:
        bf      cr7*4+3,1f
        lbzx    r0,r7,r6        /* copy 1 byte */
        stb     r0,0(r6)
1:
        blr




        /* Similar to above, but for use with 128 byte lines. */
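
        /* Illustrative note (not in the original source): this path is the
           64-byte version above with the line-size constants changed: mask
           the low 7 bits instead of 6, shift by 7 for the line count, use
           128+8 as the prefetch distance, and copy sixteen ld/std pairs
           per line.  */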


L(big_lines):

        clrldi  r7,r7,64-7      /* How far to next cacheline bdy? */

        cmpldi  cr6,r7,0        /* Are we on a cacheline bdy already? */

        /* Reduce total len by what it takes to get to the next cache line */
        subf    r5,r7,r5
        srdi    r7,r7,4         /* How many qws to get to the line bdy? */

        /* How many full cache lines to copy after getting to a line bdy? */
        srdi    r10,r5,7

        cmpldi  r10,0           /* If no full cache lines to copy ... */
        li      r11,0           /* number cachelines to copy with prefetch */
        beq     L(nocacheprefetch_128)


        /* We are here because we have at least one full cache line to copy,
           and therefore some pre-touching to do. */

        cmpldi  r10,PREFETCH_AHEAD
        li      r12,128+8       /* prefetch distance */
        ble     L(lessthanmaxprefetch_128)

        /* We can only do so much pre-fetching.  R11 will have the count of
           lines left to prefetch after the initial batch of prefetches
           are executed.  */

        subi    r11,r10,PREFETCH_AHEAD
        li      r10,PREFETCH_AHEAD

L(lessthanmaxprefetch_128):
        mtctr   r10

        /* At this point r10/ctr hold the number of lines to prefetch in this
           initial batch, and r11 holds any remainder. */

L(prefetchSRC_128):
        dcbt    r12,r4
        addi    r12,r12,128
        bdnz    L(prefetchSRC_128)


        /* Prefetching is done, or was not needed.

           cr6 - are we on a cacheline boundary already?
           r7  - number of quadwords to the next cacheline boundary
        */

L(nocacheprefetch_128):
        mtctr   r7

        cmpldi  cr1,r5,128      /* Less than a cache line to copy? */

        /* How many bytes are left after we copy whatever full
           cache lines we can get? */
        clrldi  r5,r5,64-7

        beq     cr6,L(cachelinealigned_128)


        /* Copy quadwords up to the next cacheline boundary */

L(aligntocacheline_128):
        ld      r9,0x08(r4)
        ld      r7,0x10(r4)
        addi    r4,r4,0x10
        std     r9,0x08(r6)
        stdu    r7,0x10(r6)
        bdnz    L(aligntocacheline_128)


L(cachelinealigned_128):        /* copy whole cache lines */

        blt-    cr1,L(lessthancacheline) /* size <128 */

L(outerloop_128):
        cmpdi   r11,0
        mtctr   r11
        beq-    L(endloop_128)

        li      r11,128*ZERO_AHEAD+8 /* DCBZ dist */

        .align 4
        /* Copy whole cachelines, optimized by prefetching SRC cacheline */
L(loop_128):                    /* Copy aligned body */
        dcbt    r12,r4          /* PREFETCH SOURCE some cache lines ahead */
        ld      r9, 0x08(r4)
        dcbz    r11,r6
        ld      r7, 0x10(r4)
        ld      r8, 0x18(r4)
        ld      r0, 0x20(r4)
        std     r9, 0x08(r6)
        std     r7, 0x10(r6)
        std     r8, 0x18(r6)
        std     r0, 0x20(r6)
        ld      r9, 0x28(r4)
        ld      r7, 0x30(r4)
        ld      r8, 0x38(r4)
        ld      r0, 0x40(r4)
        std     r9, 0x28(r6)
        std     r7, 0x30(r6)
        std     r8, 0x38(r6)
        std     r0, 0x40(r6)
        ld      r9, 0x48(r4)
        ld      r7, 0x50(r4)
        ld      r8, 0x58(r4)
        ld      r0, 0x60(r4)
        std     r9, 0x48(r6)
        std     r7, 0x50(r6)
        std     r8, 0x58(r6)
        std     r0, 0x60(r6)
        ld      r9, 0x68(r4)
        ld      r7, 0x70(r4)
        ld      r8, 0x78(r4)
        ld      r0, 0x80(r4)
        addi    r4, r4,0x80
        std     r9, 0x68(r6)
        std     r7, 0x70(r6)
        std     r8, 0x78(r6)
        stdu    r0, 0x80(r6)

        bdnz    L(loop_128)


L(endloop_128):
        cmpdi   r10,0
        beq-    L(endloop2_128)
        mtctr   r10

L(loop2_128):                   /* Copy aligned body */
        ld      r9, 0x08(r4)
        ld      r7, 0x10(r4)
        ld      r8, 0x18(r4)
        ld      r0, 0x20(r4)
        std     r9, 0x08(r6)
        std     r7, 0x10(r6)
        std     r8, 0x18(r6)
        std     r0, 0x20(r6)
        ld      r9, 0x28(r4)
        ld      r7, 0x30(r4)
        ld      r8, 0x38(r4)
        ld      r0, 0x40(r4)
        std     r9, 0x28(r6)
        std     r7, 0x30(r6)
        std     r8, 0x38(r6)
        std     r0, 0x40(r6)
        ld      r9, 0x48(r4)
        ld      r7, 0x50(r4)
        ld      r8, 0x58(r4)
        ld      r0, 0x60(r4)
        std     r9, 0x48(r6)
        std     r7, 0x50(r6)
        std     r8, 0x58(r6)
        std     r0, 0x60(r6)
        ld      r9, 0x68(r4)
        ld      r7, 0x70(r4)
        ld      r8, 0x78(r4)
        ld      r0, 0x80(r4)
        addi    r4, r4,0x80
        std     r9, 0x68(r6)
        std     r7, 0x70(r6)
        std     r8, 0x78(r6)
        stdu    r0, 0x80(r6)

        bdnz    L(loop2_128)
L(endloop2_128):

        b       L(lessthancacheline)


END_GEN_TB (BP_SYM (memcpy),TB_TOCLESS)
libc_hidden_builtin_def (memcpy)