/* Optimized memcpy implementation for PowerPC A2.
   Copyright (C) 2010-2019 Free Software Foundation, Inc.
   Contributed by Michael Brutman <brutman@us.ibm.com>.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

#ifndef MEMCPY
# define MEMCPY memcpy
#endif

#define PREFETCH_AHEAD 4        /* number of cache lines to prefetch ahead of SRC */
#define ZERO_AHEAD 2            /* number of cache lines to zero (dcbz) ahead of DST */
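
/* The two constants above drive the streaming strategy of the main
   loops: touch SRC a few cache lines early so the loads hit in cache,
   and establish DST lines with dcbz before storing so the stores never
   force a read of the old destination contents.  Roughly, in C (a
   sketch only, not part of the build; `line' stands for the runtime
   cache line size), one loop iteration does:

     __builtin_prefetch (src + PREFETCH_AHEAD * line, 0);  // dcbt
     // dcbz: zero-establish the DST line ZERO_AHEAD lines ahead,
     // which the loop is guaranteed to overwrite completely
     ...copy one full line with 8-byte loads and stores...
*/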

        .section ".toc","aw"
.LC0:
        .tc __cache_line_size[TC],__cache_line_size
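/* .LC0 is a TOC entry holding the address of __cache_line_size, a
   variable glibc initializes during startup (on powerpc-linux it is
   taken from the AT_DCACHEBSIZE auxiliary vector entry).  If it is
   still zero the line size is unknown and we fall back to the plain
   byte-copy loop below.  */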
        .section ".text"
        .align 2


        .machine a2
ENTRY (MEMCPY, 5)
        CALL_MCOUNT 3

        dcbt    0,r4            /* Prefetch ONE SRC cacheline  */
        cmpldi  cr1,r5,16       /* is size < 16 ?  */
        mr      r6,r3           /* Copy dest reg to r6  */
        blt+    cr1,L(shortcopy)


/* Big copy (16 bytes or more)

   Figure out how far to the nearest quadword boundary, or if we are
   on one already.  Also get the cache line size.

   r3 - return value (always)
   r4 - current source addr
   r5 - copy length
   r6 - current dest addr
*/

        neg     r8,r3           /* LS 4 bits = # bytes to 16-byte dest bdry  */
        ld      r9,.LC0@toc(r2) /* Get cache line size (part 1)  */
        clrldi  r8,r8,64-4      /* align to 16-byte boundary  */
        sub     r7,r4,r3        /* compute offset to src from dest  */
        lwz     r9,0(r9)        /* Get cache line size (part 2)  */
        cmpldi  cr0,r8,0        /* Were we aligned on a 16 byte bdy?  */
        addi    r10,r9,-1       /* Cache line mask  */
        beq+    L(dst_aligned)
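
/* Note: the neg/clrldi pair computes r8 = (0 - dest) & 0xf, i.e. the
   number of bytes (0..15) needed to bring dest up to the next 16-byte
   boundary; r7 = src - dest is kept so the source can always be
   addressed as dest + r7 with indexed loads.  */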


/* Destination is not aligned on quadword boundary.  Get us to one.

   r3 - return value (always)
   r4 - current source addr
   r5 - copy length
   r6 - current dest addr
   r7 - offset to src from dest
   r8 - number of bytes to quadword boundary
*/

        mtcrf   0x01,r8         /* put #bytes to boundary into cr7  */
        subf    r5,r8,r5        /* adjust remaining len  */

        bf      cr7*4+3,1f
        lbzx    r0,r7,r6        /* copy 1 byte  */
        stb     r0,0(r6)
        addi    r6,r6,1
1:
        bf      cr7*4+2,2f
        lhzx    r0,r7,r6        /* copy 2 bytes  */
        sth     r0,0(r6)
        addi    r6,r6,2
2:
        bf      cr7*4+1,4f
        lwzx    r0,r7,r6        /* copy 4 bytes  */
        stw     r0,0(r6)
        addi    r6,r6,4
4:
        bf      cr7*4+0,8f
        ldx     r0,r7,r6        /* copy 8 bytes  */
        std     r0,0(r6)
        addi    r6,r6,8
8:
        add     r4,r7,r6        /* update src addr  */
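
/* Equivalent C for the alignment prologue above (a sketch only, not
   part of the build): each bit of r8's low nibble, moved into cr7 by
   mtcrf, selects one progressively larger copy.

     size_t n = (0 - (uintptr_t) dst) & 0xf;   // bytes to 16B boundary
     ptrdiff_t off = src - dst;                // r7
     len -= n;
     if (n & 1) { *dst = dst[off]; dst += 1; }
     if (n & 2) { *(uint16_t *) dst = *(uint16_t *) (dst + off); dst += 2; }
     if (n & 4) { *(uint32_t *) dst = *(uint32_t *) (dst + off); dst += 4; }
     if (n & 8) { *(uint64_t *) dst = *(uint64_t *) (dst + off); dst += 8; }
     src = dst + off;
*/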



/* Dest is quadword aligned now.

   Lots of decisions to make.  If we are copying less than a cache
   line we won't be here long.  If we are not on a cache line
   boundary we need to get there.  And then we need to figure out
   how many cache lines ahead to pre-touch.

   r3 - return value (always)
   r4 - current source addr
   r5 - copy length
   r6 - current dest addr
*/


        .align 4
L(dst_aligned):
        cmpdi   cr0,r9,0        /* Cache line size set?  */
        bne+    cr0,L(cachelineset)

/* __cache_line_size not set: generic byte copy without much optimization */
        clrldi. r0,r5,63        /* If length is odd copy one byte  */
        beq     L(cachelinenotset_align)
        lbz     r7,0(r4)        /* Read one byte from source  */
        addi    r5,r5,-1        /* Update length  */
        addi    r4,r4,1         /* Update source pointer address  */
        stb     r7,0(r6)        /* Store one byte at dest  */
        addi    r6,r6,1         /* Update dest pointer address  */
L(cachelinenotset_align):
        cmpdi   cr7,r5,0        /* If length is 0 return  */
        beqlr   cr7
        ori     r2,r2,0         /* Force a new dispatch group  */
L(cachelinenotset_loop):
        addic.  r5,r5,-2        /* Update length  */
        lbz     r7,0(r4)        /* Load 2 bytes from source  */
        lbz     r8,1(r4)
        addi    r4,r4,2         /* Update source pointer address  */
        stb     r7,0(r6)        /* Store 2 bytes at dest  */
        stb     r8,1(r6)
        addi    r6,r6,2         /* Update dest pointer address  */
        bne     L(cachelinenotset_loop)
        blr
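
/* The fallback above is, in C (sketch only): peel one byte if the
   length is odd, then copy two bytes per iteration so the addic./bne
   counter pair never underflows.

     if (len & 1) { *dst++ = *src++; len--; }
     while (len) { dst[0] = src[0]; dst[1] = src[1];
                   dst += 2; src += 2; len -= 2; }
*/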


L(cachelineset):
        cmpd    cr5,r5,r10      /* Less than a cacheline to go?  */

        neg     r7,r6           /* How far to next cacheline bdy?  */

        addi    r6,r6,-8        /* prepare for stdu  */
        cmpdi   cr0,r9,128
        addi    r4,r4,-8        /* prepare for ldu  */


        ble+    cr5,L(lessthancacheline)

        beq-    cr0,L(big_lines) /* 128 byte line code  */



/* More than a cacheline left to go, and using 64 byte cachelines */

        clrldi  r7,r7,64-6      /* How far to next cacheline bdy?  */

        cmpldi  cr6,r7,0        /* Are we on a cacheline bdy already?  */

/* Reduce total len by what it takes to get to the next cache line */
        subf    r5,r7,r5
        srdi    r7,r7,4         /* How many qws to get to the line bdy?  */

/* How many full cache lines to copy after getting to a line bdy? */
        srdi    r10,r5,6

        cmpldi  r10,0           /* If no full cache lines to copy ...  */
        li      r11,0           /* number cachelines to copy with prefetch  */
        beq     L(nocacheprefetch)


/* We are here because we have at least one full cache line to copy,
   and therefore some pre-touching to do. */

        cmpldi  r10,PREFETCH_AHEAD
        li      r12,64+8        /* prefetch distance  */
        ble     L(lessthanmaxprefetch)

/* We can only do so much pre-fetching.  R11 will have the count of
   lines left to prefetch after the initial batch of prefetches
   are executed. */

        subi    r11,r10,PREFETCH_AHEAD
        li      r10,PREFETCH_AHEAD

L(lessthanmaxprefetch):
        mtctr   r10

/* At this point r10/ctr hold the number of lines to prefetch in this
   initial batch, and r11 holds any remainder. */

L(prefetchSRC):
        dcbt    r12,r4
        addi    r12,r12,64
        bdnz    L(prefetchSRC)
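
/* In C the initial prefetch batch is roughly (sketch only;
   __builtin_prefetch typically compiles to dcbt on this target):

     size_t batch = nlines > PREFETCH_AHEAD ? PREFETCH_AHEAD : nlines;
     for (size_t i = 1; i <= batch; i++)
       __builtin_prefetch (src + i * 64, 0);

   r12 starts at 64+8 because r4 was biased by -8 above for ldu.  */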


/* Prefetching is done, or was not needed.

   cr6 - are we on a cacheline boundary already?
   r7 - number of quadwords to the next cacheline boundary
*/

L(nocacheprefetch):
        mtctr   r7

        cmpldi  cr1,r5,64       /* Less than a cache line to copy?  */

/* How many bytes are left after we copy whatever full
   cache lines we can get? */
        clrldi  r5,r5,64-6

        beq     cr6,L(cachelinealigned)


/* Copy quadwords up to the next cacheline boundary */

L(aligntocacheline):
        ld      r9,0x08(r4)
        ld      r7,0x10(r4)
        addi    r4,r4,0x10
        std     r9,0x08(r6)
        stdu    r7,0x10(r6)
        bdnz    L(aligntocacheline)
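
/* Because r4 and r6 were biased by -8, the ld/std pair plus the
   update-form stdu moves 16 bytes per iteration while advancing both
   pointers; in C (sketch):
     do { *(uint64_t *) d = *(uint64_t *) s;
          *(uint64_t *) (d + 8) = *(uint64_t *) (s + 8);
          s += 16; d += 16; }
     while (--qwords);  */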


        .align 4
L(cachelinealigned):            /* copy whole cache lines  */

        blt-    cr1,L(lessthancacheline) /* size <64  */

L(outerloop):
        cmpdi   r11,0
        mtctr   r11
        beq-    L(endloop)

        li      r11,64*ZERO_AHEAD+8 /* DCBZ dist  */

        .align 4
/* Copy whole cachelines, optimized by prefetching SRC cacheline */
L(loop):                        /* Copy aligned body  */
        dcbt    r12,r4          /* PREFETCH SOURCE some cache lines ahead  */
        ld      r9,0x08(r4)
        dcbz    r11,r6
        ld      r7,0x10(r4)
        ld      r8,0x18(r4)
        ld      r0,0x20(r4)
        std     r9,0x08(r6)
        std     r7,0x10(r6)
        std     r8,0x18(r6)
        std     r0,0x20(r6)
        ld      r9,0x28(r4)
        ld      r7,0x30(r4)
        ld      r8,0x38(r4)
        ld      r0,0x40(r4)
        addi    r4,r4,0x40
        std     r9,0x28(r6)
        std     r7,0x30(r6)
        std     r8,0x38(r6)
        stdu    r0,0x40(r6)

        bdnz    L(loop)
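
/* Each iteration above moves one 64-byte line with eight ld/std
   pairs.  The dcbz r11,r6 establishes the destination line ZERO_AHEAD
   lines ahead directly in the cache (zeroing it, so its old contents
   never have to be read from memory); that is safe only because the
   loop is guaranteed to overwrite every byte of that line.  The dcbt
   keeps streaming SRC PREFETCH_AHEAD lines ahead.  */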


L(endloop):
        cmpdi   r10,0
        beq-    L(endloop2)
        mtctr   r10

L(loop2):                       /* Copy aligned body  */
        ld      r9,0x08(r4)
        ld      r7,0x10(r4)
        ld      r8,0x18(r4)
        ld      r0,0x20(r4)
        std     r9,0x08(r6)
        std     r7,0x10(r6)
        std     r8,0x18(r6)
        std     r0,0x20(r6)
        ld      r9,0x28(r4)
        ld      r7,0x30(r4)
        ld      r8,0x38(r4)
        ld      r0,0x40(r4)
        addi    r4,r4,0x40
        std     r9,0x28(r6)
        std     r7,0x30(r6)
        std     r8,0x38(r6)
        stdu    r0,0x40(r6)

        bdnz    L(loop2)
L(endloop2):


        .align 4
L(lessthancacheline):           /* Less than a cache line left to copy?  */
        cmpldi  cr0,r5,16
        srdi    r7,r5,4         /* divide size by 16  */
        blt-    L(do_lt16)
        mtctr   r7

L(copy_remaining):
        ld      r8,0x08(r4)
        ld      r7,0x10(r4)
        addi    r4,r4,0x10
        std     r8,0x08(r6)
        stdu    r7,0x10(r6)
        bdnz    L(copy_remaining)

L(do_lt16):                     /* less than 16 ?  */
        cmpldi  cr0,r5,0        /* copy remaining bytes (0-15)  */
        beqlr+                  /* no rest to copy  */
        addi    r4,r4,8
        addi    r6,r6,8
L(shortcopy):                   /* SIMPLE COPY to handle size <= 15 bytes  */
        mtcrf   0x01,r5
        sub     r7,r4,r6
        bf-     cr7*4+0,8f
        ldx     r0,r7,r6        /* copy 8 bytes  */
        std     r0,0(r6)
        addi    r6,r6,8
8:
        bf      cr7*4+1,4f
        lwzx    r0,r7,r6        /* copy 4 bytes  */
        stw     r0,0(r6)
        addi    r6,r6,4
4:
        bf      cr7*4+2,2f
        lhzx    r0,r7,r6        /* copy 2 bytes  */
        sth     r0,0(r6)
        addi    r6,r6,2
2:
        bf      cr7*4+3,1f
        lbzx    r0,r7,r6        /* copy 1 byte  */
        stb     r0,0(r6)
1:
        blr
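
/* The tail copy mirrors the alignment prologue but walks the bits of
   the remaining length from high to low; in C (sketch only):

     ptrdiff_t off = src - dst;
     if (len & 8) { *(uint64_t *) dst = *(uint64_t *) (dst + off); dst += 8; }
     if (len & 4) { *(uint32_t *) dst = *(uint32_t *) (dst + off); dst += 4; }
     if (len & 2) { *(uint16_t *) dst = *(uint16_t *) (dst + off); dst += 2; }
     if (len & 1) *dst = dst[off];
*/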




/* Similar to above, but for use with 128 byte lines. */


L(big_lines):

        clrldi  r7,r7,64-7      /* How far to next cacheline bdy?  */

        cmpldi  cr6,r7,0        /* Are we on a cacheline bdy already?  */

/* Reduce total len by what it takes to get to the next cache line */
        subf    r5,r7,r5
        srdi    r7,r7,4         /* How many qws to get to the line bdy?  */

/* How many full cache lines to copy after getting to a line bdy? */
        srdi    r10,r5,7

        cmpldi  r10,0           /* If no full cache lines to copy ...  */
        li      r11,0           /* number cachelines to copy with prefetch  */
        beq     L(nocacheprefetch_128)


/* We are here because we have at least one full cache line to copy,
   and therefore some pre-touching to do. */

        cmpldi  r10,PREFETCH_AHEAD
        li      r12,128+8       /* prefetch distance  */
        ble     L(lessthanmaxprefetch_128)

/* We can only do so much pre-fetching.  R11 will have the count of
   lines left to prefetch after the initial batch of prefetches
   are executed. */

        subi    r11,r10,PREFETCH_AHEAD
        li      r10,PREFETCH_AHEAD

L(lessthanmaxprefetch_128):
        mtctr   r10

/* At this point r10/ctr hold the number of lines to prefetch in this
   initial batch, and r11 holds any remainder. */

L(prefetchSRC_128):
        dcbt    r12,r4
        addi    r12,r12,128
        bdnz    L(prefetchSRC_128)


/* Prefetching is done, or was not needed.

   cr6 - are we on a cacheline boundary already?
   r7 - number of quadwords to the next cacheline boundary
*/

L(nocacheprefetch_128):
        mtctr   r7

        cmpldi  cr1,r5,128      /* Less than a cache line to copy?  */

/* How many bytes are left after we copy whatever full
   cache lines we can get? */
        clrldi  r5,r5,64-7

        beq     cr6,L(cachelinealigned_128)


/* Copy quadwords up to the next cacheline boundary */

L(aligntocacheline_128):
        ld      r9,0x08(r4)
        ld      r7,0x10(r4)
        addi    r4,r4,0x10
        std     r9,0x08(r6)
        stdu    r7,0x10(r6)
        bdnz    L(aligntocacheline_128)

L(cachelinealigned_128):        /* copy whole cache lines  */

        blt-    cr1,L(lessthancacheline) /* size <128  */

L(outerloop_128):
        cmpdi   r11,0
        mtctr   r11
        beq-    L(endloop_128)

        li      r11,128*ZERO_AHEAD+8 /* DCBZ dist  */

        .align 4
/* Copy whole cachelines, optimized by prefetching SRC cacheline */
L(loop_128):                    /* Copy aligned body  */
        dcbt    r12,r4          /* PREFETCH SOURCE some cache lines ahead  */
        ld      r9,0x08(r4)
        dcbz    r11,r6
        ld      r7,0x10(r4)
        ld      r8,0x18(r4)
        ld      r0,0x20(r4)
        std     r9,0x08(r6)
        std     r7,0x10(r6)
        std     r8,0x18(r6)
        std     r0,0x20(r6)
        ld      r9,0x28(r4)
        ld      r7,0x30(r4)
        ld      r8,0x38(r4)
        ld      r0,0x40(r4)
        std     r9,0x28(r6)
        std     r7,0x30(r6)
        std     r8,0x38(r6)
        std     r0,0x40(r6)
        ld      r9,0x48(r4)
        ld      r7,0x50(r4)
        ld      r8,0x58(r4)
        ld      r0,0x60(r4)
        std     r9,0x48(r6)
        std     r7,0x50(r6)
        std     r8,0x58(r6)
        std     r0,0x60(r6)
        ld      r9,0x68(r4)
        ld      r7,0x70(r4)
        ld      r8,0x78(r4)
        ld      r0,0x80(r4)
        addi    r4,r4,0x80
        std     r9,0x68(r6)
        std     r7,0x70(r6)
        std     r8,0x78(r6)
        stdu    r0,0x80(r6)

        bdnz    L(loop_128)


L(endloop_128):
        cmpdi   r10,0
        beq-    L(endloop2_128)
        mtctr   r10

L(loop2_128):                   /* Copy aligned body  */
        ld      r9,0x08(r4)
        ld      r7,0x10(r4)
        ld      r8,0x18(r4)
        ld      r0,0x20(r4)
        std     r9,0x08(r6)
        std     r7,0x10(r6)
        std     r8,0x18(r6)
        std     r0,0x20(r6)
        ld      r9,0x28(r4)
        ld      r7,0x30(r4)
        ld      r8,0x38(r4)
        ld      r0,0x40(r4)
        std     r9,0x28(r6)
        std     r7,0x30(r6)
        std     r8,0x38(r6)
        std     r0,0x40(r6)
        ld      r9,0x48(r4)
        ld      r7,0x50(r4)
        ld      r8,0x58(r4)
        ld      r0,0x60(r4)
        std     r9,0x48(r6)
        std     r7,0x50(r6)
        std     r8,0x58(r6)
        std     r0,0x60(r6)
        ld      r9,0x68(r4)
        ld      r7,0x70(r4)
        ld      r8,0x78(r4)
        ld      r0,0x80(r4)
        addi    r4,r4,0x80
        std     r9,0x68(r6)
        std     r7,0x70(r6)
        std     r8,0x78(r6)
        stdu    r0,0x80(r6)

        bdnz    L(loop2_128)
L(endloop2_128):

        b       L(lessthancacheline)


END_GEN_TB (MEMCPY,TB_TOCLESS)
libc_hidden_builtin_def (memcpy)