1 /* Optimized memcpy implementation for PowerPC A2.
2 Copyright (C) 2010-2016 Free Software Foundation, Inc.
3 Contributed by Michael Brutman <brutman@us.ibm.com>.
4 This file is part of the GNU C Library.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
22 #define PREFETCH_AHEAD 4 /* number of SRC cache lines to prefetch ahead */
23 #define ZERO_AHEAD 2 /* number of DST cache lines to zero (dcbz) ahead */
/* TOC entry referring to the __cache_line_size symbol. */
27 .tc __cache_line_size[TC],__cache_line_size
/* Entry: r3 = dest, r4 = src, r5 = length.  r3 is left untouched so it can
   be returned as-is; r6 is used as the working dest pointer instead.  */
36 dcbt 0,r4 /* Prefetch ONE SRC cacheline */
37 cmpldi cr1,r5,16 /* cr1 = length compared with 16 (is size < 16 ?) */
38 mr r6,r3 /* r6 = working copy of dest; r3 is kept as the return value */
42 /* Big copy (16 bytes or more)
44 Figure out how far the destination is from the next quadword (16-byte)
45 boundary, or if we are on one already. Also get the cache line size.
47 r3 - return value (always)
48 r4 - current source addr
r5 - remaining length
50 r6 - current dest addr
53 neg r8,r3 /* LS 4 bits = # bytes to 16-byte dest bdry */
54 ld r9,.LC0@toc(r2) /* Get cache line size (part 1): fetch its address from the TOC */
55 clrldi r8,r8,64-4 /* keep only the low 4 bits: 0-15 bytes to the 16-byte bdry */
56 sub r7,r4,r3 /* r7 = src - dest: offset to src from dest */
57 lwz r9,0(r9) /* Get cache line size (part 2): load the 32-bit value */
58 cmpldi cr0,r8,0 /* Were we aligned on a 16 byte bdy? */
59 addi r10,r9,-1 /* r10 = cache line size - 1 (cache line mask) */
64 /* Destination is not aligned on a quadword boundary. Get us to one.
66 r3 - return value (always)
67 r4 - current source addr
r5 - remaining length
69 r6 - current dest addr
70 r7 - offset to src from dest
71 r8 - number of bytes to quadword boundary
74 mtcrf 0x01,r8 /* put #bytes to boundary into cr7 */
75 subf r5,r8,r5 /* r5 = r5 - r8: remaining len once the boundary is reached */
78 lbzx r0,r7,r6 /* load 1 byte from src addr (r6 + r7) */
83 lhzx r0,r7,r6 /* load 2 bytes from src addr (r6 + r7) */
88 lwzx r0,r7,r6 /* load 4 bytes from src addr (r6 + r7) */
93 ldx r0,r7,r6 /* load 8 bytes from src addr (r6 + r7) */
97 add r4,r7,r6 /* r4 = r6 + r7: update src addr */
101 /* Dest is quadword aligned now.
103 Lots of decisions to make. If we are copying less than a cache
104 line we won't be here long. If we are not on a cache line
105 boundary we need to get there. And then we need to figure out
106 how many cache lines ahead to pre-touch.
108 r3 - return value (always)
109 r4 - current source addr
r5 - remaining length
111 r6 - current dest addr
117 cmpdi cr0,r9,0 /* Cache line size set (non-zero)? */
118 bne+ cr0,L(cachelineset) /* yes (the expected case): use the cache-line-aware copy */
120 /* __cache_line_size not set: generic byte copy without much optimization */
121 clrldi. r0,r5,63 /* r0 = low bit of length; if length is odd copy one byte */
122 beq L(cachelinenotset_align) /* even length: skip the single-byte copy */
123 lbz r7,0(r4) /* Read one byte from source */
124 addi r5,r5,-1 /* Update length */
125 addi r4,r4,1 /* Update source pointer address */
126 stb r7,0(r6) /* Store one byte at dest */
127 addi r6,r6,1 /* Update dest pointer address */
128 L(cachelinenotset_align):
129 cmpdi cr7,r5,0 /* If length is 0 return */
131 ori r2,r2,0 /* Force a new dispatch group */
132 L(cachelinenotset_loop):
133 addic. r5,r5,-2 /* Update length */
134 lbz r7,0(r4) /* Load 2 bytes from source */
136 addi r4,r4,2 /* Update source pointer address */
137 stb r7,0(r6) /* Store 2 bytes on dest */
139 addi r6,r6,2 /* Update dest pointer address */
140 bne L(cachelinenotset_loop) /* repeat until the length set by addic. reaches 0 */
145 cmpd cr5,r5,r10 /* Less than a cacheline to go? (r5 vs. the cache line mask) */
147 neg r7,r6 /* r7 = -dest; its low bits give the distance to the next cacheline bdy */
149 addi r6,r6,-8 /* prepare for stdu */
151 addi r4,r4,-8 /* prepare for ldu */
154 ble+ cr5,L(lessthancacheline) /* length <= r10: no full-line copying */
156 beq- cr0,L(big_lines) /* 128 byte line code */
160 /* More than a cacheline left to go, and using 64 byte cachelines */
162 clrldi r7,r7,64-6 /* keep low 6 bits: bytes to next 64-byte cacheline bdy */
164 cmpldi cr6,r7,0 /* Are we on a cacheline bdy already? */
166 /* Reduce total len by what it takes to get to the next cache line */
168 srdi r7,r7,4 /* r7 / 16: how many qws to get to the line bdy? */
170 /* How many full cache lines to copy after getting to a line bdy? */
173 cmpldi r10,0 /* If no full cache lines to copy ... */
174 li r11,0 /* number cachelines to copy with prefetch */
175 beq L(nocacheprefetch) /* ... skip the prefetch setup */
178 /* We are here because we have at least one full cache line to copy,
179 and therefore some pre-touching to do. */
181 cmpldi r10,PREFETCH_AHEAD /* line count vs. size of the initial prefetch batch */
182 li r12,64+8 /* prefetch distance: one 64-byte line, +8 */
183 ble L(lessthanmaxprefetch) /* r10 <= PREFETCH_AHEAD: one batch covers it */
185 /* We can only do so much pre-fetching. R11 will have the count of
186 lines left to prefetch after the initial batch of prefetches
189 subi r11,r10,PREFETCH_AHEAD
190 li r10,PREFETCH_AHEAD
192 L(lessthanmaxprefetch):
195 /* At this point r10/ctr hold the number of lines to prefetch in this
196 initial batch, and r11 holds any remainder. */
204 /* Prefetching is done, or was not needed.
206 cr6 - are we on a cacheline boundary already?
207 r7 - number of quadwords to the next cacheline boundary
213 cmpldi cr1,r5,64 /* Less than a cache line to copy? */
215 /* How many bytes are left after we copy whatever full
216 cache lines we can get? */
219 beq cr6,L(cachelinealigned) /* already on a line bdy: skip the quadword alignment copy */
222 /* Copy quadwords up to the next cacheline boundary */
230 bdnz L(aligntocacheline)
234 L(cachelinealigned): /* copy while cache lines */
236 blt- cr1,L(lessthancacheline) /* size <64 */
243 li r11,64*ZERO_AHEAD +8 /* DCBZ distance: ZERO_AHEAD 64-byte lines ahead, +8 */
246 /* Copy whole cachelines, optimized by prefetching SRC cacheline */
247 L(loop): /* Copy aligned body */
248 dcbt r12,r4 /* PREFETCH SOURCE some cache lines ahead */
276 L(loop2): /* Copy aligned body */
300 L(lessthancacheline): /* Less than a cache line left to copy */
302 srdi r7,r5,4 /* r7 = remaining size / 16 */
312 bdnz L(copy_remaining)
314 L(do_lt16): /* fewer than 16 bytes left */
315 cmpldi cr0,r5,0 /* copy remaining bytes (0-15) */
316 beqlr+ /* no rest to copy: return */
320 L(shortcopy): /* SIMPLE COPY to handle size <= 15 bytes */
324 ldx r0,r7,r6 /* copy 8 bytes: load from r6 + r7 */
329 lwzx r0,r7,r6 /* copy 4 bytes: load from r6 + r7 */
334 lhzx r0,r7,r6 /* copy 2 bytes: load from r6 + r7 */
339 lbzx r0,r7,r6 /* copy 1 byte: load from r6 + r7 */
348 /* Similar to above, but for use with 128 byte lines. */
353 clrldi r7,r7,64-7 /* keep low 7 bits: bytes to next 128-byte cacheline bdy */
355 cmpldi cr6,r7,0 /* Are we on a cacheline bdy already? */
357 /* Reduce total len by what it takes to get to the next cache line */
359 srdi r7,r7,4 /* r7 / 16: how many qws to get to the line bdy? */
361 /* How many full cache lines to copy after getting to a line bdy? */
364 cmpldi r10,0 /* If no full cache lines to copy ... */
365 li r11,0 /* number cachelines to copy with prefetch */
366 beq L(nocacheprefetch_128) /* ... skip the prefetch setup */
369 /* We are here because we have at least one full cache line to copy,
370 and therefore some pre-touching to do. */
372 cmpldi r10,PREFETCH_AHEAD /* line count vs. size of the initial prefetch batch */
373 li r12,128+8 /* prefetch distance: one 128-byte line, +8 */
374 ble L(lessthanmaxprefetch_128) /* r10 <= PREFETCH_AHEAD: one batch covers it */
376 /* We can only do so much pre-fetching. R11 will have the count of
377 lines left to prefetch after the initial batch of prefetches
380 subi r11,r10,PREFETCH_AHEAD
381 li r10,PREFETCH_AHEAD
383 L(lessthanmaxprefetch_128):
386 /* At this point r10/ctr hold the number of lines to prefetch in this
387 initial batch, and r11 holds any remainder. */
392 bdnz L(prefetchSRC_128) /* one iteration per line of the initial batch (count in ctr) */
395 /* Prefetching is done, or was not needed.
397 cr6 - are we on a cacheline boundary already?
398 r7 - number of quadwords to the next cacheline boundary
401 L(nocacheprefetch_128):
404 cmpldi cr1,r5,128 /* Less than a cache line to copy? */
406 /* How many bytes are left after we copy whatever full
407 cache lines we can get? */
410 beq cr6,L(cachelinealigned_128) /* already on a line bdy: skip the quadword alignment copy */
413 /* Copy quadwords up to the next cacheline boundary */
415 L(aligntocacheline_128):
421 bdnz L(aligntocacheline_128) /* loop over the quadwords up to the line bdy */
424 L(cachelinealigned_128): /* copy while cache lines */
426 blt- cr1,L(lessthancacheline) /* size <128 */
433 li r11,128*ZERO_AHEAD +8 /* DCBZ distance: ZERO_AHEAD 128-byte lines ahead, +8 */
436 /* Copy whole cachelines, optimized by prefetching SRC cacheline */
437 L(loop_128): /* Copy aligned body */
438 dcbt r12,r4 /* PREFETCH SOURCE some cache lines ahead */
482 L(loop2_128): /* Copy aligned body */
520 b L(lessthancacheline) /* let the shared tail code copy what is left */
523 END_GEN_TB (memcpy,TB_TOCLESS)
524 libc_hidden_builtin_def (memcpy)