1 /* Optimized memcpy implementation for PowerPC A2.
2 Copyright (C) 2010-2013 Free Software Foundation, Inc.
3 Contributed by Michael Brutman <brutman@us.ibm.com>.
4 This file is part of the GNU C Library.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
24 #define PREFETCH_AHEAD 4 /* number of SRC cache lines to prefetch ahead */
25 #define ZERO_AHEAD 2 /* number of DST cache lines to zero (dcbz) ahead */
/* TOC entry for the runtime cache-line-size variable; loaded below via
   .LC0@toc(r2) and then dereferenced with lwz to get the line size.  */
29 .tc __cache_line_size[TC],__cache_line_size
/* void *memcpy (void *dst, const void *src, size_t len)

   PowerPC64 (A2) entry contract, per the register-role comments below:
     r3 = dst (preserved as the return value throughout)
     r4 = src (advanced as the copy proceeds)
     r5 = len (decremented as the copy proceeds)
     r6 = working dest pointer (copy of r3)

   Overall strategy, as documented inline:
     1. Sizes < 16 take a short-copy path (cr1 compare at entry).
     2. Otherwise align the destination to a 16-byte (quadword) boundary,
        copying 1/2/4/8-byte pieces selected via cr7 (mtcrf of r8).
     3. Load the runtime cache line size (__cache_line_size via the TOC);
        if it is unset, fall back to a simple byte-pair copy loop.
     4. With a known line size (64- or 128-byte variants below), advance
        to a cache-line boundary, prefetch SRC lines ahead with dcbt
        (bounded by PREFETCH_AHEAD), copy whole lines, then mop up the
        sub-cacheline tail through L(lessthancacheline)/L(shortcopy).

   NOTE(review): the embedded original line numbers are NOT contiguous —
   this view elides many instructions (loop bodies, the store halves of
   the alignment copies, dcbz sequences, etc.).  The listing below is an
   incomplete excerpt of the routine; consult the full source before
   modifying any instruction sequence here.  */
35 EALIGN (BP_SYM (memcpy), 5, 0)
38 dcbt 0,r4 /* Prefetch ONE SRC cacheline */
39 cmpldi cr1,r5,16 /* is size < 16 ? */
40 mr r6,r3 /* Copy dest reg to r6; */
44 /* Big copy (16 bytes or more)
46 Figure out how far to the nearest quadword boundary, or if we are
47 on one already. Also get the cache line size.
49 r3 - return value (always)
50 r4 - current source addr
52 r6 - current dest addr
55 neg r8,r3 /* LS 4 bits = # bytes to 8-byte dest bdry */
56 ld r9,.LC0@toc(r2) /* Get cache line size (part 1) */
57 clrldi r8,r8,64-4 /* align to 16byte boundary */
58 sub r7,r4,r3 /* compute offset to src from dest */
59 lwz r9,0(r9) /* Get cache line size (part 2) */
60 cmpldi cr0,r8,0 /* Were we aligned on a 16 byte bdy? */
61 addi r10,r9,-1 /* Cache line mask */
66 /* Destination is not aligned on quadword boundary. Get us to one.
68 r3 - return value (always)
69 r4 - current source addr
71 r6 - current dest addr
72 r7 - offset to src from dest
73 r8 - number of bytes to quadword boundary
76 mtcrf 0x01,r8 /* put #bytes to boundary into cr7 */
77 subf r5,r8,r5 /* adjust remaining len */
/* Alignment copies: each load below is addressed src-relative as
   r7(+r6) = (src - dst) + dst; the matching stores are elided from this
   view.  cr7 bits select which of the 1/2/4/8-byte pieces are done.  */
80 lbzx r0,r7,r6 /* copy 1 byte addr */
85 lhzx r0,r7,r6 /* copy 2 byte addr */
90 lwzx r0,r7,r6 /* copy 4 byte addr */
95 ldx r0,r7,r6 /* copy 8 byte addr */
99 add r4,r7,r6 /* update src addr */
103 /* Dest is quadword aligned now.
105 Lots of decisions to make. If we are copying less than a cache
106 line we won't be here long. If we are not on a cache line
107 boundary we need to get there. And then we need to figure out
108 how many cache lines ahead to pre-touch.
110 r3 - return value (always)
111 r4 - current source addr
113 r6 - current dest addr
119 cmpdi cr0,r9,0 /* Cache line size set? */
120 bne+ cr0,L(cachelineset)
122 /* __cache_line_size not set: generic byte copy without much optimization */
123 clrldi. r0,r5,63 /* If length is odd copy one byte */
124 beq L(cachelinenotset_align)
125 lbz r7,0(r4) /* Read one byte from source */
126 addi r5,r5,-1 /* Update length */
127 addi r4,r4,1 /* Update source pointer address */
128 stb r7,0(r6) /* Store one byte at dest */
129 addi r6,r6,1 /* Update dest pointer address */
130 L(cachelinenotset_align):
131 cmpdi cr7,r5,0 /* If length is 0 return */
133 ori r2,r2,0 /* Force a new dispatch group */
/* Fallback loop: len is even here; copies 2 bytes per iteration (the
   second lbz/stb pair of each iteration is elided from this view).  */
134 L(cachelinenotset_loop):
135 addic. r5,r5,-2 /* Update length */
136 lbz r7,0(r4) /* Load 2 bytes from source */
138 addi r4,r4,2 /* Update source pointer address */
139 stb r7,0(r6) /* Store 2 bytes on dest */
141 addi r6,r6,2 /* Update dest pointer address */
142 bne L(cachelinenotset_loop)
/* Cache line size known (r9 = size, r10 = size-1 mask) from here on.  */
147 cmpd cr5,r5,r10 /* Less than a cacheline to go? */
149 neg r7,r6 /* How far to next cacheline bdy? */
151 addi r6,r6,-8 /* prepare for stdu */
153 addi r4,r4,-8 /* prepare for ldu */
156 ble+ cr5,L(lessthancacheline)
158 beq- cr0,L(big_lines) /* 128 byte line code */
162 /* More than a cacheline left to go, and using 64 byte cachelines */
164 clrldi r7,r7,64-6 /* How far to next cacheline bdy? */
166 cmpldi cr6,r7,0 /* Are we on a cacheline bdy already? */
168 /* Reduce total len by what it takes to get to the next cache line */
170 srdi r7,r7,4 /* How many qws to get to the line bdy? */
172 /* How many full cache lines to copy after getting to a line bdy? */
175 cmpldi r10,0 /* If no full cache lines to copy ... */
176 li r11,0 /* number cachelines to copy with prefetch */
177 beq L(nocacheprefetch)
180 /* We are here because we have at least one full cache line to copy,
181 and therefore some pre-touching to do. */
183 cmpldi r10,PREFETCH_AHEAD
184 li r12,64+8 /* prefetch distance */
185 ble L(lessthanmaxprefetch)
187 /* We can only do so much pre-fetching. R11 will have the count of
188 lines left to prefetch after the initial batch of prefetches
191 subi r11,r10,PREFETCH_AHEAD
192 li r10,PREFETCH_AHEAD
194 L(lessthanmaxprefetch):
197 /* At this point r10/ctr hold the number of lines to prefetch in this
198 initial batch, and r11 holds any remainder. */
206 /* Prefetching is done, or was not needed.
208 cr6 - are we on a cacheline boundary already?
209 r7 - number of quadwords to the next cacheline boundary
215 cmpldi cr1,r5,64 /* Less than a cache line to copy? */
217 /* How many bytes are left after we copy whatever full
218 cache lines we can get? */
221 beq cr6,L(cachelinealigned)
224 /* Copy quadwords up to the next cacheline boundary */
232 bdnz L(aligntocacheline)
236 L(cachelinealigned): /* copy while cache lines */
238 blt- cr1,L(lessthancacheline) /* size <64 */
245 li r11,64*ZERO_AHEAD +8 /* DCBZ dist */
248 /* Copy whole cachelines, optimized by prefetching SRC cacheline */
249 L(loop): /* Copy aligned body */
250 dcbt r12,r4 /* PREFETCH SOURCE some cache lines ahead */
278 L(loop2): /* Copy aligned body */
/* Tail handling: shared by the 64- and 128-byte line paths.  */
302 L(lessthancacheline): /* Was there less than cache to do ? */
304 srdi r7,r5,4 /* divide size by 16 */
314 bdnz L(copy_remaining)
316 L(do_lt16): /* less than 16 ? */
317 cmpldi cr0,r5,0 /* copy remaining bytes (0-15) */
318 beqlr+ /* no rest to copy */
/* Mirrors the alignment copies above; the matching stores are elided
   from this view.  */
322 L(shortcopy): /* SIMPLE COPY to handle size =< 15 bytes */
326 ldx r0,r7,r6 /* copy 8 byte */
331 lwzx r0,r7,r6 /* copy 4 byte */
336 lhzx r0,r7,r6 /* copy 2 byte */
341 lbzx r0,r7,r6 /* copy 1 byte */
350 /* Similar to above, but for use with 128 byte lines. */
355 clrldi r7,r7,64-7 /* How far to next cacheline bdy? */
357 cmpldi cr6,r7,0 /* Are we on a cacheline bdy already? */
359 /* Reduce total len by what it takes to get to the next cache line */
361 srdi r7,r7,4 /* How many qws to get to the line bdy? */
363 /* How many full cache lines to copy after getting to a line bdy? */
366 cmpldi r10,0 /* If no full cache lines to copy ... */
367 li r11,0 /* number cachelines to copy with prefetch */
368 beq L(nocacheprefetch_128)
371 /* We are here because we have at least one full cache line to copy,
372 and therefore some pre-touching to do. */
374 cmpldi r10,PREFETCH_AHEAD
375 li r12,128+8 /* prefetch distance */
376 ble L(lessthanmaxprefetch_128)
378 /* We can only do so much pre-fetching. R11 will have the count of
379 lines left to prefetch after the initial batch of prefetches
382 subi r11,r10,PREFETCH_AHEAD
383 li r10,PREFETCH_AHEAD
385 L(lessthanmaxprefetch_128):
388 /* At this point r10/ctr hold the number of lines to prefetch in this
389 initial batch, and r11 holds any remainder. */
394 bdnz L(prefetchSRC_128)
397 /* Prefetching is done, or was not needed.
399 cr6 - are we on a cacheline boundary already?
400 r7 - number of quadwords to the next cacheline boundary
403 L(nocacheprefetch_128):
406 cmpldi cr1,r5,128 /* Less than a cache line to copy? */
408 /* How many bytes are left after we copy whatever full
409 cache lines we can get? */
412 beq cr6,L(cachelinealigned_128)
415 /* Copy quadwords up to the next cacheline boundary */
417 L(aligntocacheline_128):
423 bdnz L(aligntocacheline_128)
426 L(cachelinealigned_128): /* copy while cache lines */
428 blt- cr1,L(lessthancacheline) /* size <128 */
435 li r11,128*ZERO_AHEAD +8 /* DCBZ dist */
438 /* Copy whole cachelines, optimized by prefetching SRC cacheline */
439 L(loop_128): /* Copy aligned body */
440 dcbt r12,r4 /* PREFETCH SOURCE some cache lines ahead */
484 L(loop2_128): /* Copy aligned body */
522 b L(lessthancacheline)
525 END_GEN_TB (BP_SYM (memcpy),TB_TOCLESS)
526 libc_hidden_builtin_def (memcpy)