/* File: sysdeps/powerpc/powerpc64/a2/memcpy.S (from the GNU C Library
   source tree).  */
/* Optimized memcpy implementation for PowerPC A2.
   Copyright (C) 2010-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
18
#include <sysdep.h>
#include <rtld-global-offsets.h>

#ifndef MEMCPY
# define MEMCPY memcpy
#endif

#define PREFETCH_AHEAD 4        /* no cache lines SRC prefetching ahead  */
#define ZERO_AHEAD 2            /* no cache lines DST zeroing ahead  */

        .section ".toc","aw"
__GLRO_DEF(dl_cache_line_size)


        .section ".text"
        .align 2


/* void *[r3] MEMCPY (void *dst [r3], const void *src [r4], size_t len [r5])

   Returns dst.  Strategy: align DST to a 16-byte boundary, then copy
   whole cache lines (64- or 128-byte, per _dl_cache_line_size) with
   dcbt prefetch of SRC and dcbz pre-zeroing of DST, then mop up the
   remaining 0-15 bytes.  Falls back to a simple 2-bytes-per-iteration
   loop when the cache line size is not yet known (early in ld.so).  */
        .machine a2
ENTRY (MEMCPY, 5)
        CALL_MCOUNT 3

        dcbt    0,r4            /* Prefetch ONE SRC cacheline  */
        cmpldi  cr1,r5,16       /* is size < 16 ?  */
        mr      r6,r3           /* Copy dest reg to r6; */
        blt+    cr1,L(shortcopy)


        /* Big copy (16 bytes or more)

           Figure out how far to the nearest quadword boundary, or if we are
           on one already.  Also get the cache line size.

           r3 - return value (always)
           r4 - current source addr
           r5 - copy length
           r6 - current dest addr
        */

        neg     r8,r3           /* LS 4 bits = # bytes to 8-byte dest bdry  */
        /* Get the cache line size.  */
        __GLRO (r9, dl_cache_line_size,
                RTLD_GLOBAL_RO_DL_CACHE_LINE_SIZE_OFFSET)
        clrldi  r8,r8,64-4      /* align to 16byte boundary  */
        sub     r7,r4,r3        /* compute offset to src from dest  */
        cmpldi  cr0,r8,0        /* Were we aligned on a 16 byte bdy?  */
        addi    r10,r9,-1       /* Cache line mask  */
        beq+    L(dst_aligned)



        /* Destination is not aligned on quadword boundary.  Get us to one.

           r3 - return value (always)
           r4 - current source addr
           r5 - copy length
           r6 - current dest addr
           r7 - offset to src from dest
           r8 - number of bytes to quadword boundary
        */

        mtcrf   0x01,r8         /* put #bytes to boundary into cr7  */
        subf    r5,r8,r5        /* adjust remaining len  */

        bf      cr7*4+3,1f
        lbzx    r0,r7,r6        /* copy 1 byte addr  */
        stb     r0,0(r6)
        addi    r6,r6,1
1:
        bf      cr7*4+2,2f
        lhzx    r0,r7,r6        /* copy 2 byte addr  */
        sth     r0,0(r6)
        addi    r6,r6,2
2:
        bf      cr7*4+1,4f
        lwzx    r0,r7,r6        /* copy 4 byte addr  */
        stw     r0,0(r6)
        addi    r6,r6,4
4:
        bf      cr7*4+0,8f
        ldx     r0,r7,r6        /* copy 8 byte addr  */
        std     r0,0(r6)
        addi    r6,r6,8
8:
        add     r4,r7,r6        /* update src addr  */



        /* Dest is quadword aligned now.

           Lots of decisions to make.  If we are copying less than a cache
           line we won't be here long.  If we are not on a cache line
           boundary we need to get there.  And then we need to figure out
           how many cache lines ahead to pre-touch.

           r3 - return value (always)
           r4 - current source addr
           r5 - copy length
           r6 - current dest addr
        */


        .align 4
L(dst_aligned):
        cmpdi   cr0,r9,0        /* Cache line size set?  */
        bne+    cr0,L(cachelineset)

/* Cache line size not set: generic byte copy without much optimization */
        clrldi. r0,r5,63        /* If length is odd copy one byte  */
        beq     L(cachelinenotset_align)
        lbz     r7,0(r4)        /* Read one byte from source  */
        addi    r5,r5,-1        /* Update length  */
        addi    r4,r4,1         /* Update source pointer address  */
        stb     r7,0(r6)        /* Store one byte at dest  */
        addi    r6,r6,1         /* Update dest pointer address  */
L(cachelinenotset_align):
        cmpdi   cr7,r5,0        /* If length is 0 return  */
        beqlr   cr7
        ori     r2,r2,0         /* Force a new dispatch group  */
L(cachelinenotset_loop):
        addic.  r5,r5,-2        /* Update length  */
        lbz     r7,0(r4)        /* Load 2 bytes from source  */
        lbz     r8,1(r4)
        addi    r4,r4,2         /* Update source pointer address  */
        stb     r7,0(r6)        /* Store 2 bytes on dest  */
        stb     r8,1(r6)
        addi    r6,r6,2         /* Update dest pointer address  */
        bne     L(cachelinenotset_loop)
        blr


L(cachelineset):
        cmpd    cr5,r5,r10      /* Less than a cacheline to go?  */

        neg     r7,r6           /* How far to next cacheline bdy?  */

        addi    r6,r6,-8        /* prepare for stdu  */
        cmpdi   cr0,r9,128
        addi    r4,r4,-8        /* prepare for ldu  */


        ble+    cr5,L(lessthancacheline)

        beq-    cr0,L(big_lines) /* 128 byte line code  */



        /* More than a cacheline left to go, and using 64 byte cachelines */

        clrldi  r7,r7,64-6      /* How far to next cacheline bdy?  */

        cmpldi  cr6,r7,0        /* Are we on a cacheline bdy already?  */

        /* Reduce total len by what it takes to get to the next cache line */
        subf    r5,r7,r5
        srdi    r7,r7,4         /* How many qws to get to the line bdy?  */

        /* How many full cache lines to copy after getting to a line bdy?  */
        srdi    r10,r5,6

        cmpldi  r10,0           /* If no full cache lines to copy ...  */
        li      r11,0           /* number cachelines to copy with prefetch  */
        beq     L(nocacheprefetch)


        /* We are here because we have at least one full cache line to copy,
           and therefore some pre-touching to do.  */

        cmpldi  r10,PREFETCH_AHEAD
        li      r12,64+8        /* prefetch distance  */
        ble     L(lessthanmaxprefetch)

        /* We can only do so much pre-fetching.  R11 will have the count of
           lines left to prefetch after the initial batch of prefetches
           are executed.  */

        subi    r11,r10,PREFETCH_AHEAD
        li      r10,PREFETCH_AHEAD

L(lessthanmaxprefetch):
        mtctr   r10

        /* At this point r10/ctr hold the number of lines to prefetch in this
           initial batch, and r11 holds any remainder.  */

L(prefetchSRC):
        dcbt    r12,r4
        addi    r12,r12,64
        bdnz    L(prefetchSRC)


        /* Prefetching is done, or was not needed.

           cr6 - are we on a cacheline boundary already?
           r7  - number of quadwords to the next cacheline boundary
        */

L(nocacheprefetch):
        mtctr   r7

        cmpldi  cr1,r5,64       /* Less than a cache line to copy?  */

        /* How many bytes are left after we copy whatever full
           cache lines we can get?  */
        clrldi  r5,r5,64-6

        beq     cr6,L(cachelinealigned)


        /* Copy quadwords up to the next cacheline boundary */

L(aligntocacheline):
        ld      r9,0x08(r4)
        ld      r7,0x10(r4)
        addi    r4,r4,0x10
        std     r9,0x08(r6)
        stdu    r7,0x10(r6)
        bdnz    L(aligntocacheline)


        .align 4
L(cachelinealigned):            /* copy while cache lines  */

        blt-    cr1,L(lessthancacheline) /* size <64  */

L(outerloop):
        cmpdi   r11,0
        mtctr   r11
        beq-    L(endloop)

        li      r11,64*ZERO_AHEAD +8    /* DCBZ dist  */

        .align  4
        /* Copy whole cachelines, optimized by prefetching SRC cacheline  */
L(loop):                        /* Copy aligned body  */
        dcbt    r12,r4          /* PREFETCH SOURCE some cache lines ahead  */
        ld      r9, 0x08(r4)
        dcbz    r11,r6
        ld      r7, 0x10(r4)
        ld      r8, 0x18(r4)
        ld      r0, 0x20(r4)
        std     r9, 0x08(r6)
        std     r7, 0x10(r6)
        std     r8, 0x18(r6)
        std     r0, 0x20(r6)
        ld      r9, 0x28(r4)
        ld      r7, 0x30(r4)
        ld      r8, 0x38(r4)
        ld      r0, 0x40(r4)
        addi    r4, r4,0x40
        std     r9, 0x28(r6)
        std     r7, 0x30(r6)
        std     r8, 0x38(r6)
        stdu    r0, 0x40(r6)

        bdnz    L(loop)


L(endloop):
        cmpdi   r10,0
        beq-    L(endloop2)
        mtctr   r10

L(loop2):                       /* Copy aligned body  */
        ld      r9, 0x08(r4)
        ld      r7, 0x10(r4)
        ld      r8, 0x18(r4)
        ld      r0, 0x20(r4)
        std     r9, 0x08(r6)
        std     r7, 0x10(r6)
        std     r8, 0x18(r6)
        std     r0, 0x20(r6)
        ld      r9, 0x28(r4)
        ld      r7, 0x30(r4)
        ld      r8, 0x38(r4)
        ld      r0, 0x40(r4)
        addi    r4, r4,0x40
        std     r9, 0x28(r6)
        std     r7, 0x30(r6)
        std     r8, 0x38(r6)
        stdu    r0, 0x40(r6)

        bdnz    L(loop2)
L(endloop2):


        .align 4
L(lessthancacheline):           /* Was there less than cache to do ?  */
        cmpldi  cr0,r5,16
        srdi    r7,r5,4         /* divide size by 16  */
        blt-    L(do_lt16)
        mtctr   r7

L(copy_remaining):
        ld      r8,0x08(r4)
        ld      r7,0x10(r4)
        addi    r4,r4,0x10
        std     r8,0x08(r6)
        stdu    r7,0x10(r6)
        bdnz    L(copy_remaining)

L(do_lt16):                     /* less than 16 ?  */
        cmpldi  cr0,r5,0        /* copy remaining bytes (0-15)  */
        beqlr+                  /* no rest to copy  */
        addi    r4,r4,8
        addi    r6,r6,8

L(shortcopy):                   /* SIMPLE COPY to handle size =< 15 bytes  */
        mtcrf   0x01,r5
        sub     r7,r4,r6
        bf-     cr7*4+0,8f
        ldx     r0,r7,r6        /* copy 8 byte  */
        std     r0,0(r6)
        addi    r6,r6,8
8:
        bf      cr7*4+1,4f
        lwzx    r0,r7,r6        /* copy 4 byte  */
        stw     r0,0(r6)
        addi    r6,r6,4
4:
        bf      cr7*4+2,2f
        lhzx    r0,r7,r6        /* copy 2 byte  */
        sth     r0,0(r6)
        addi    r6,r6,2
2:
        bf      cr7*4+3,1f
        lbzx    r0,r7,r6        /* copy 1 byte  */
        stb     r0,0(r6)
1:
        blr





        /* Similar to above, but for use with 128 byte lines.  */


L(big_lines):

        clrldi  r7,r7,64-7      /* How far to next cacheline bdy?  */

        cmpldi  cr6,r7,0        /* Are we on a cacheline bdy already?  */

        /* Reduce total len by what it takes to get to the next cache line */
        subf    r5,r7,r5
        srdi    r7,r7,4         /* How many qws to get to the line bdy?  */

        /* How many full cache lines to copy after getting to a line bdy?  */
        srdi    r10,r5,7

        cmpldi  r10,0           /* If no full cache lines to copy ...  */
        li      r11,0           /* number cachelines to copy with prefetch  */
        beq     L(nocacheprefetch_128)


        /* We are here because we have at least one full cache line to copy,
           and therefore some pre-touching to do.  */

        cmpldi  r10,PREFETCH_AHEAD
        li      r12,128+8       /* prefetch distance  */
        ble     L(lessthanmaxprefetch_128)

        /* We can only do so much pre-fetching.  R11 will have the count of
           lines left to prefetch after the initial batch of prefetches
           are executed.  */

        subi    r11,r10,PREFETCH_AHEAD
        li      r10,PREFETCH_AHEAD

L(lessthanmaxprefetch_128):
        mtctr   r10

        /* At this point r10/ctr hold the number of lines to prefetch in this
           initial batch, and r11 holds any remainder.  */

L(prefetchSRC_128):
        dcbt    r12,r4
        addi    r12,r12,128
        bdnz    L(prefetchSRC_128)


        /* Prefetching is done, or was not needed.

           cr6 - are we on a cacheline boundary already?
           r7  - number of quadwords to the next cacheline boundary
        */

L(nocacheprefetch_128):
        mtctr   r7

        cmpldi  cr1,r5,128      /* Less than a cache line to copy?  */

        /* How many bytes are left after we copy whatever full
           cache lines we can get?  */
        clrldi  r5,r5,64-7

        beq     cr6,L(cachelinealigned_128)


        /* Copy quadwords up to the next cacheline boundary */

L(aligntocacheline_128):
        ld      r9,0x08(r4)
        ld      r7,0x10(r4)
        addi    r4,r4,0x10
        std     r9,0x08(r6)
        stdu    r7,0x10(r6)
        bdnz    L(aligntocacheline_128)


L(cachelinealigned_128):        /* copy while cache lines  */

        blt-    cr1,L(lessthancacheline) /* size <128  */

L(outerloop_128):
        cmpdi   r11,0
        mtctr   r11
        beq-    L(endloop_128)

        li      r11,128*ZERO_AHEAD +8   /* DCBZ dist  */

        .align  4
        /* Copy whole cachelines, optimized by prefetching SRC cacheline  */
L(loop_128):                    /* Copy aligned body  */
        dcbt    r12,r4          /* PREFETCH SOURCE some cache lines ahead  */
        ld      r9, 0x08(r4)
        dcbz    r11,r6
        ld      r7, 0x10(r4)
        ld      r8, 0x18(r4)
        ld      r0, 0x20(r4)
        std     r9, 0x08(r6)
        std     r7, 0x10(r6)
        std     r8, 0x18(r6)
        std     r0, 0x20(r6)
        ld      r9, 0x28(r4)
        ld      r7, 0x30(r4)
        ld      r8, 0x38(r4)
        ld      r0, 0x40(r4)
        std     r9, 0x28(r6)
        std     r7, 0x30(r6)
        std     r8, 0x38(r6)
        std     r0, 0x40(r6)
        ld      r9, 0x48(r4)
        ld      r7, 0x50(r4)
        ld      r8, 0x58(r4)
        ld      r0, 0x60(r4)
        std     r9, 0x48(r6)
        std     r7, 0x50(r6)
        std     r8, 0x58(r6)
        std     r0, 0x60(r6)
        ld      r9, 0x68(r4)
        ld      r7, 0x70(r4)
        ld      r8, 0x78(r4)
        ld      r0, 0x80(r4)
        addi    r4, r4,0x80
        std     r9, 0x68(r6)
        std     r7, 0x70(r6)
        std     r8, 0x78(r6)
        stdu    r0, 0x80(r6)

        bdnz    L(loop_128)


L(endloop_128):
        cmpdi   r10,0
        beq-    L(endloop2_128)
        mtctr   r10

L(loop2_128):                   /* Copy aligned body  */
        ld      r9, 0x08(r4)
        ld      r7, 0x10(r4)
        ld      r8, 0x18(r4)
        ld      r0, 0x20(r4)
        std     r9, 0x08(r6)
        std     r7, 0x10(r6)
        std     r8, 0x18(r6)
        std     r0, 0x20(r6)
        ld      r9, 0x28(r4)
        ld      r7, 0x30(r4)
        ld      r8, 0x38(r4)
        ld      r0, 0x40(r4)
        std     r9, 0x28(r6)
        std     r7, 0x30(r6)
        std     r8, 0x38(r6)
        std     r0, 0x40(r6)
        ld      r9, 0x48(r4)
        ld      r7, 0x50(r4)
        ld      r8, 0x58(r4)
        ld      r0, 0x60(r4)
        std     r9, 0x48(r6)
        std     r7, 0x50(r6)
        std     r8, 0x58(r6)
        std     r0, 0x60(r6)
        ld      r9, 0x68(r4)
        ld      r7, 0x70(r4)
        ld      r8, 0x78(r4)
        ld      r0, 0x80(r4)
        addi    r4, r4,0x80
        std     r9, 0x68(r6)
        std     r7, 0x70(r6)
        std     r8, 0x78(r6)
        stdu    r0, 0x80(r6)

        bdnz    L(loop2_128)
L(endloop2_128):

        b       L(lessthancacheline)


END_GEN_TB (MEMCPY,TB_TOCLESS)
libc_hidden_builtin_def (memcpy)