/* Optimized memcpy implementation for PowerPC A2.
   Copyright (C) 2010-2016 Free Software Foundation, Inc.
   Contributed by Michael Brutman <brutman@us.ibm.com>.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>

#define PREFETCH_AHEAD 4        /* number of cache lines to prefetch ahead of SRC */
#define ZERO_AHEAD 2            /* number of cache lines to zero ahead of DST */
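
/* The main copy loops prefetch the source PREFETCH_AHEAD cache lines
   ahead with dcbt, and establish destination lines ZERO_AHEAD lines
   ahead with dcbz so they are allocated in the cache without first
   being read from memory.  */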

        .section ".toc","aw"
.LC0:
        .tc __cache_line_size[TC],__cache_line_size
        .section ".text"
        .align 2


        .machine a2
EALIGN (memcpy, 5, 0)
        CALL_MCOUNT 3

        dcbt    0,r4            /* Prefetch ONE SRC cacheline */
        cmpldi  cr1,r5,16       /* is size < 16 ? */
        mr      r6,r3           /* Copy dest reg to r6 */
        blt+    cr1,L(shortcopy)

/* Big copy (16 bytes or more)

   Figure out how far to the nearest quadword boundary, or if we are
   on one already.  Also get the cache line size.

   r3 - return value (always)
   r4 - current source addr
   r5 - copy length
   r6 - current dest addr
*/
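
/* Roughly, in C (an illustrative sketch, not part of the build):

     size_t pad = (-(uintptr_t) dst) & 15;    // r8: bytes to 16-byte bdy
     ptrdiff_t off = src - dst;               // r7: offset to src from dest
     size_t mask = cache_line_size - 1;       // r10: cache line mask

   Keeping the offset in r7 lets the alignment code read the source as
   r7+r6 while advancing only the destination pointer r6.  */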

        neg     r8,r3           /* LS 4 bits = # bytes to 16-byte dest bdry */
        ld      r9,.LC0@toc(r2) /* Get cache line size (part 1) */
        clrldi  r8,r8,64-4      /* align to 16-byte boundary */
        sub     r7,r4,r3        /* compute offset to src from dest */
        lwz     r9,0(r9)        /* Get cache line size (part 2) */
        cmpldi  cr0,r8,0        /* Were we aligned on a 16 byte bdy? */
        addi    r10,r9,-1       /* Cache line mask */
        beq+    L(dst_aligned)


/* Destination is not aligned on quadword boundary.  Get us to one.

   r3 - return value (always)
   r4 - current source addr
   r5 - copy length
   r6 - current dest addr
   r7 - offset to src from dest
   r8 - number of bytes to quadword boundary
*/

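/* Dispatch on the low four bits of r8 via cr7: mtcrf 0x01,r8 copies
   those bits into cr7, and each bf below tests one bit, copying 1, 2,
   4, or 8 bytes, so any 0-15 byte prefix is handled without a loop.  */
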
        mtcrf   0x01,r8         /* put #bytes to boundary into cr7 */
        subf    r5,r8,r5        /* adjust remaining len */

        bf      cr7*4+3,1f
        lbzx    r0,r7,r6        /* copy 1 byte */
        stb     r0,0(r6)
        addi    r6,r6,1
1:
        bf      cr7*4+2,2f
        lhzx    r0,r7,r6        /* copy 2 bytes */
        sth     r0,0(r6)
        addi    r6,r6,2
2:
        bf      cr7*4+1,4f
        lwzx    r0,r7,r6        /* copy 4 bytes */
        stw     r0,0(r6)
        addi    r6,r6,4
4:
        bf      cr7*4+0,8f
        ldx     r0,r7,r6        /* copy 8 bytes */
        std     r0,0(r6)
        addi    r6,r6,8
8:
        add     r4,r7,r6        /* update src addr */


/* Dest is quadword aligned now.

   Lots of decisions to make.  If we are copying less than a cache
   line we won't be here long.  If we are not on a cache line
   boundary we need to get there.  And then we need to figure out
   how many cache lines ahead to pre-touch.

   r3 - return value (always)
   r4 - current source addr
   r5 - copy length
   r6 - current dest addr
*/

        .align 4
L(dst_aligned):
        cmpdi   cr0,r9,0        /* Cache line size set? */
        bne+    cr0,L(cachelineset)

/* __cache_line_size not set: generic byte copy without much optimization */
        clrldi. r0,r5,63        /* If length is odd copy one byte */
        beq     L(cachelinenotset_align)
        lbz     r7,0(r4)        /* Read one byte from source */
        addi    r5,r5,-1        /* Update length */
        addi    r4,r4,1         /* Update source pointer address */
        stb     r7,0(r6)        /* Store one byte at dest */
        addi    r6,r6,1         /* Update dest pointer address */
L(cachelinenotset_align):
        cmpdi   cr7,r5,0        /* If length is 0 return */
        beqlr   cr7
        ori     r2,r2,0         /* Force a new dispatch group */
L(cachelinenotset_loop):
        addic.  r5,r5,-2        /* Update length */
        lbz     r7,0(r4)        /* Load 2 bytes from source */
        lbz     r8,1(r4)
        addi    r4,r4,2         /* Update source pointer address */
        stb     r7,0(r6)        /* Store 2 bytes on dest */
        stb     r8,1(r6)
        addi    r6,r6,2         /* Update dest pointer address */
        bne     L(cachelinenotset_loop)
        blr
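
/* Note: __cache_line_size is zero when the aux vector did not provide
   the d-cache block size (AT_DCACHEBSIZE), so this path avoids dcbz,
   whose effect depends on the actual line size, and just copies bytes.  */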


L(cachelineset):
        cmpd    cr5,r5,r10      /* Less than a cacheline to go? */

        neg     r7,r6           /* How far to next cacheline bdy? */

        addi    r6,r6,-8        /* prepare for stdu */
        cmpdi   cr0,r9,128
        addi    r4,r4,-8        /* prepare for ldu */

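/* Both pointers are biased down by 8 so the loops below can address
   data at offsets 0x08 and up and end each iteration with a single
   stdu, which stores the last doubleword and advances r6 in one
   instruction.  */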

        ble+    cr5,L(lessthancacheline)

        beq-    cr0,L(big_lines)        /* 128 byte line code */


/* More than a cacheline left to go, and using 64 byte cachelines */

        clrldi  r7,r7,64-6      /* How far to next cacheline bdy? */

        cmpldi  cr6,r7,0        /* Are we on a cacheline bdy already? */

/* Reduce total len by what it takes to get to the next cache line */
        subf    r5,r7,r5
        srdi    r7,r7,4         /* How many qws to get to the line bdy? */

/* How many full cache lines to copy after getting to a line bdy? */
        srdi    r10,r5,6

        cmpldi  r10,0           /* If no full cache lines to copy ... */
        li      r11,0           /* number of cachelines to copy with prefetch */
        beq     L(nocacheprefetch)


/* We are here because we have at least one full cache line to copy,
   and therefore some pre-touching to do.  */

        cmpldi  r10,PREFETCH_AHEAD
        li      r12,64+8        /* prefetch distance */
        ble     L(lessthanmaxprefetch)

/* We can only do so much pre-fetching.  R11 will have the count of
   lines left to prefetch after the initial batch of prefetches
   are executed.  */

        subi    r11,r10,PREFETCH_AHEAD
        li      r10,PREFETCH_AHEAD

L(lessthanmaxprefetch):
        mtctr   r10

/* At this point r10/ctr hold the number of lines to prefetch in this
   initial batch, and r11 holds any remainder.  */

L(prefetchSRC):
        dcbt    r12,r4
        addi    r12,r12,64
        bdnz    L(prefetchSRC)
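
/* With r4 biased by -8 and r12 starting at 64+8, the dcbt above touches
   the line 64 bytes ahead of the current source position, then each
   following line; r12 is left holding the steady-state prefetch
   distance used by L(loop).  */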


/* Prefetching is done, or was not needed.

   cr6 - are we on a cacheline boundary already?
   r7 - number of quadwords to the next cacheline boundary
*/

L(nocacheprefetch):
        mtctr   r7

        cmpldi  cr1,r5,64       /* Less than a cache line to copy? */

/* How many bytes are left after we copy whatever full
   cache lines we can get?  */
        clrldi  r5,r5,64-6

        beq     cr6,L(cachelinealigned)


/* Copy quadwords up to the next cacheline boundary */

L(aligntocacheline):
        ld      r9,0x08(r4)
        ld      r7,0x10(r4)
        addi    r4,r4,0x10
        std     r9,0x08(r6)
        stdu    r7,0x10(r6)
        bdnz    L(aligntocacheline)
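
/* Each iteration above moves one 16-byte quadword as two doubleword
   load/store pairs; the trailing stdu advances r6 by 16, so no separate
   pointer update is needed.  */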

        .align 4
L(cachelinealigned):            /* copy whole cache lines */

        blt-    cr1,L(lessthancacheline)        /* size < 64 */

L(outerloop):
        cmpdi   r11,0
        mtctr   r11
        beq-    L(endloop)

        li      r11,64*ZERO_AHEAD+8     /* DCBZ dist */
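
/* r11 is recycled here: its line count is now in ctr, and from this
   point r11 holds the dcbz offset (ZERO_AHEAD lines plus the 8-byte
   pointer bias).  */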

        .align 4
/* Copy whole cachelines, optimized by prefetching SRC cacheline */
L(loop):                        /* Copy aligned body */
        dcbt    r12,r4          /* PREFETCH SOURCE some cache lines ahead */
        ld      r9, 0x08(r4)
        dcbz    r11,r6
        ld      r7, 0x10(r4)
        ld      r8, 0x18(r4)
        ld      r0, 0x20(r4)
        std     r9, 0x08(r6)
        std     r7, 0x10(r6)
        std     r8, 0x18(r6)
        std     r0, 0x20(r6)
        ld      r9, 0x28(r4)
        ld      r7, 0x30(r4)
        ld      r8, 0x38(r4)
        ld      r0, 0x40(r4)
        addi    r4, r4,0x40
        std     r9, 0x28(r6)
        std     r7, 0x30(r6)
        std     r8, 0x38(r6)
        stdu    r0, 0x40(r6)

        bdnz    L(loop)
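
/* Each L(loop) iteration copies one full 64-byte line: dcbz establishes
   the destination line in the cache without reading it from memory,
   while dcbt keeps the source stream running ahead of the copy.  */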


L(endloop):
        cmpdi   r10,0
        beq-    L(endloop2)
        mtctr   r10

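/* L(loop2) drains the last r10 lines (at most PREFETCH_AHEAD), which
   the initial dcbt batch already prefetched, so it omits the dcbt; it
   also omits dcbz, since zeroing destination lines ahead here could
   reach past the end of the copy.  */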
L(loop2):                       /* Copy aligned body */
        ld      r9, 0x08(r4)
        ld      r7, 0x10(r4)
        ld      r8, 0x18(r4)
        ld      r0, 0x20(r4)
        std     r9, 0x08(r6)
        std     r7, 0x10(r6)
        std     r8, 0x18(r6)
        std     r0, 0x20(r6)
        ld      r9, 0x28(r4)
        ld      r7, 0x30(r4)
        ld      r8, 0x38(r4)
        ld      r0, 0x40(r4)
        addi    r4, r4,0x40
        std     r9, 0x28(r6)
        std     r7, 0x30(r6)
        std     r8, 0x38(r6)
        stdu    r0, 0x40(r6)

        bdnz    L(loop2)
L(endloop2):


        .align 4
L(lessthancacheline):           /* Less than a cache line left to do? */
        cmpldi  cr0,r5,16
        srdi    r7,r5,4         /* divide size by 16 */
        blt-    L(do_lt16)
        mtctr   r7

L(copy_remaining):
        ld      r8,0x08(r4)
        ld      r7,0x10(r4)
        addi    r4,r4,0x10
        std     r8,0x08(r6)
        stdu    r7,0x10(r6)
        bdnz    L(copy_remaining)

L(do_lt16):                     /* less than 16 bytes? */
        cmpldi  cr0,r5,0        /* copy remaining bytes (0-15) */
        beqlr+                  /* no rest to copy */
        addi    r4,r4,8
        addi    r6,r6,8

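/* The two addi instructions above undo the -8 bias applied at
   L(cachelineset), so L(shortcopy) sees the true current addresses;
   entry into L(shortcopy) straight from the function start is already
   unbiased.  */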
L(shortcopy):                   /* SIMPLE COPY to handle size <= 15 bytes */
        mtcrf   0x01,r5
        sub     r7,r4,r6
        bf-     cr7*4+0,8f
        ldx     r0,r7,r6        /* copy 8 bytes */
        std     r0,0(r6)
        addi    r6,r6,8
8:
        bf      cr7*4+1,4f
        lwzx    r0,r7,r6        /* copy 4 bytes */
        stw     r0,0(r6)
        addi    r6,r6,4
4:
        bf      cr7*4+2,2f
        lhzx    r0,r7,r6        /* copy 2 bytes */
        sth     r0,0(r6)
        addi    r6,r6,2
2:
        bf      cr7*4+3,1f
        lbzx    r0,r7,r6        /* copy 1 byte */
        stb     r0,0(r6)
1:
        blr
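
/* Same cr7 bit-dispatch as the destination-alignment code, but driven
   by the low bits of the length and in descending order (8, 4, 2, 1),
   so at most four tests finish any 0-15 byte tail.  */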


/* Similar to above, but for use with 128 byte lines.  */

L(big_lines):

        clrldi  r7,r7,64-7      /* How far to next cacheline bdy? */

        cmpldi  cr6,r7,0        /* Are we on a cacheline bdy already? */

/* Reduce total len by what it takes to get to the next cache line */
        subf    r5,r7,r5
        srdi    r7,r7,4         /* How many qws to get to the line bdy? */

/* How many full cache lines to copy after getting to a line bdy? */
        srdi    r10,r5,7

        cmpldi  r10,0           /* If no full cache lines to copy ... */
        li      r11,0           /* number of cachelines to copy with prefetch */
        beq     L(nocacheprefetch_128)


/* We are here because we have at least one full cache line to copy,
   and therefore some pre-touching to do.  */

        cmpldi  r10,PREFETCH_AHEAD
        li      r12,128+8       /* prefetch distance */
        ble     L(lessthanmaxprefetch_128)

/* We can only do so much pre-fetching.  R11 will have the count of
   lines left to prefetch after the initial batch of prefetches
   are executed.  */

        subi    r11,r10,PREFETCH_AHEAD
        li      r10,PREFETCH_AHEAD

L(lessthanmaxprefetch_128):
        mtctr   r10

/* At this point r10/ctr hold the number of lines to prefetch in this
   initial batch, and r11 holds any remainder.  */

L(prefetchSRC_128):
        dcbt    r12,r4
        addi    r12,r12,128
        bdnz    L(prefetchSRC_128)


/* Prefetching is done, or was not needed.

   cr6 - are we on a cacheline boundary already?
   r7 - number of quadwords to the next cacheline boundary
*/

L(nocacheprefetch_128):
        mtctr   r7

        cmpldi  cr1,r5,128      /* Less than a cache line to copy? */

/* How many bytes are left after we copy whatever full
   cache lines we can get?  */
        clrldi  r5,r5,64-7

        beq     cr6,L(cachelinealigned_128)


/* Copy quadwords up to the next cacheline boundary */

L(aligntocacheline_128):
        ld      r9,0x08(r4)
        ld      r7,0x10(r4)
        addi    r4,r4,0x10
        std     r9,0x08(r6)
        stdu    r7,0x10(r6)
        bdnz    L(aligntocacheline_128)


L(cachelinealigned_128):        /* copy whole cache lines */

        blt-    cr1,L(lessthancacheline)        /* size < 128 */

L(outerloop_128):
        cmpdi   r11,0
        mtctr   r11
        beq-    L(endloop_128)

        li      r11,128*ZERO_AHEAD+8    /* DCBZ dist */

        .align 4
/* Copy whole cachelines, optimized by prefetching SRC cacheline */
L(loop_128):                    /* Copy aligned body */
        dcbt    r12,r4          /* PREFETCH SOURCE some cache lines ahead */
        ld      r9, 0x08(r4)
        dcbz    r11,r6
        ld      r7, 0x10(r4)
        ld      r8, 0x18(r4)
        ld      r0, 0x20(r4)
        std     r9, 0x08(r6)
        std     r7, 0x10(r6)
        std     r8, 0x18(r6)
        std     r0, 0x20(r6)
        ld      r9, 0x28(r4)
        ld      r7, 0x30(r4)
        ld      r8, 0x38(r4)
        ld      r0, 0x40(r4)
        std     r9, 0x28(r6)
        std     r7, 0x30(r6)
        std     r8, 0x38(r6)
        std     r0, 0x40(r6)
        ld      r9, 0x48(r4)
        ld      r7, 0x50(r4)
        ld      r8, 0x58(r4)
        ld      r0, 0x60(r4)
        std     r9, 0x48(r6)
        std     r7, 0x50(r6)
        std     r8, 0x58(r6)
        std     r0, 0x60(r6)
        ld      r9, 0x68(r4)
        ld      r7, 0x70(r4)
        ld      r8, 0x78(r4)
        ld      r0, 0x80(r4)
        addi    r4, r4,0x80
        std     r9, 0x68(r6)
        std     r7, 0x70(r6)
        std     r8, 0x78(r6)
        stdu    r0, 0x80(r6)

        bdnz    L(loop_128)


L(endloop_128):
        cmpdi   r10,0
        beq-    L(endloop2_128)
        mtctr   r10

L(loop2_128):                   /* Copy aligned body */
        ld      r9, 0x08(r4)
        ld      r7, 0x10(r4)
        ld      r8, 0x18(r4)
        ld      r0, 0x20(r4)
        std     r9, 0x08(r6)
        std     r7, 0x10(r6)
        std     r8, 0x18(r6)
        std     r0, 0x20(r6)
        ld      r9, 0x28(r4)
        ld      r7, 0x30(r4)
        ld      r8, 0x38(r4)
        ld      r0, 0x40(r4)
        std     r9, 0x28(r6)
        std     r7, 0x30(r6)
        std     r8, 0x38(r6)
        std     r0, 0x40(r6)
        ld      r9, 0x48(r4)
        ld      r7, 0x50(r4)
        ld      r8, 0x58(r4)
        ld      r0, 0x60(r4)
        std     r9, 0x48(r6)
        std     r7, 0x50(r6)
        std     r8, 0x58(r6)
        std     r0, 0x60(r6)
        ld      r9, 0x68(r4)
        ld      r7, 0x70(r4)
        ld      r8, 0x78(r4)
        ld      r0, 0x80(r4)
        addi    r4, r4,0x80
        std     r9, 0x68(r6)
        std     r7, 0x70(r6)
        std     r8, 0x78(r6)
        stdu    r0, 0x80(r6)

        bdnz    L(loop2_128)
L(endloop2_128):
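
/* Any remaining 0-127 byte tail is handled by the common
   L(lessthancacheline) code shared with the 64-byte line path.  */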

        b       L(lessthancacheline)


END_GEN_TB (memcpy,TB_TOCLESS)
libc_hidden_builtin_def (memcpy)