/* Optimized memcpy implementation for PowerPC A2.
   Copyright (C) 2010-2019 Free Software Foundation, Inc.
   Contributed by Michael Brutman <brutman@us.ibm.com>.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
#include <sysdep.h>

#ifndef MEMCPY
# define MEMCPY memcpy
#endif

#define PREFETCH_AHEAD 4	/* number of cache lines to prefetch ahead of SRC  */
#define ZERO_AHEAD 2		/* number of cache lines to zero ahead of DST  */
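/* PREFETCH_AHEAD and ZERO_AHEAD are counts of cache lines; the code
   below turns them into byte offsets (64+8 or 128+8 for the dcbt
   prefetch distance, line_size*ZERO_AHEAD+8 for the dcbz distance).
   The +8 compensates for the pointers being biased by -8 for the
   ldu/stdu update forms.  */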
	.section	".toc","aw"
.LC0:
	.tc __cache_line_size[TC],__cache_line_size
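/* __cache_line_size is reached through the TOC: the ld of .LC0@toc(r2)
   below fetches the variable's address, and the following lwz reads its
   value (zero if the loader never filled it in).  */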
	.section	".text"
	.align 2

	.machine  a2
EALIGN (MEMCPY, 5, 0)
	CALL_MCOUNT 3

	dcbt	0,r4		/* Prefetch ONE SRC cacheline  */
	cmpldi	cr1,r5,16	/* is size < 16?  */
	mr	r6,r3		/* Copy dest reg to r6.  */
	blt+	cr1,L(shortcopy)
	/* Big copy (16 bytes or more).

	   Figure out how far to the nearest quadword boundary, or if we are
	   on one already.  Also get the cache line size.

	   r3 - return value (always)
	   r4 - current source addr
	   r5 - copy length
	   r6 - current dest addr  */
	neg	r8,r3		/* LS 4 bits = # bytes to 16-byte dest bdry  */
	ld	r9,.LC0@toc(r2)	/* Get cache line size (part 1)  */
	clrldi	r8,r8,64-4	/* align to 16-byte boundary  */
	sub	r7,r4,r3	/* compute offset to src from dest  */
	lwz	r9,0(r9)	/* Get cache line size (part 2)  */
	cmpldi	cr0,r8,0	/* Were we aligned on a 16-byte bdy?  */
	addi	r10,r9,-1	/* Cache line mask  */

	beq+	cr0,L(dst_aligned)
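	/* The neg/clrldi pair computes (0 - dest) & 0xF, i.e. the number of
	   bytes to copy before dest reaches a 16-byte boundary; e.g. a dest
	   ending in 0x9 yields r8 = 7.  */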
	/* Destination is not aligned on quadword boundary.  Get us to one.

	   r3 - return value (always)
	   r4 - current source addr
	   r5 - copy length
	   r6 - current dest addr
	   r7 - offset to src from dest
	   r8 - number of bytes to quadword boundary  */
	mtcrf	0x01,r8		/* put #bytes to boundary into cr7  */
	subf	r5,r8,r5	/* adjust remaining len  */
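	/* mtcrf 0x01 copies the low four bits of r8 into cr7, so each bit
	   can gate one of the 1/2/4/8-byte moves below and the 0-15 byte
	   adjustment needs no loop.  */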
	bf	cr7*4+3,1f
	lbzx	r0,r7,r6	/* copy 1 byte  */
	stb	r0,0(r6)
	addi	r6,r6,1

1:	bf	cr7*4+2,2f
	lhzx	r0,r7,r6	/* copy 2 bytes  */
	sth	r0,0(r6)
	addi	r6,r6,2

2:	bf	cr7*4+1,4f
	lwzx	r0,r7,r6	/* copy 4 bytes  */
	stw	r0,0(r6)
	addi	r6,r6,4

4:	bf	cr7*4+0,8f
	ldx	r0,r7,r6	/* copy 8 bytes  */
	std	r0,0(r6)
	addi	r6,r6,8

8:
	add	r4,r7,r6	/* update src addr  */
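	/* r7 holds src - dest, so the indexed loads above fetch from the
	   source while only r6 advances; add r4,r7,r6 then rematerializes
	   the source pointer in a single instruction.  */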
	/* Dest is quadword aligned now.

	   Lots of decisions to make.  If we are copying less than a cache
	   line we won't be here long.  If we are not on a cache line
	   boundary we need to get there.  And then we need to figure out
	   how many cache lines ahead to pre-touch.

	   r3 - return value (always)
	   r4 - current source addr
	   r5 - copy length
	   r6 - current dest addr  */
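	/* A rough C sketch of the flow from here on (illustrative only, not
	   part of the build; cache_line_size stands in for the
	   __cache_line_size variable read above):

	     void *sketch (void *dst, const void *src, size_t len)
	     {
	       char *d = dst;
	       const char *s = src;
	       size_t line = cache_line_size;
	       if (line == 0)                     // fallback: plain byte copy
	         {
	           while (len--)
	             *d++ = *s++;
	           return dst;
	         }
	       while (len && ((uintptr_t) d & (line - 1)))
	         {                                // creep up to a line boundary
	           *d++ = *s++;
	           len--;
	         }
	       while (len >= line)                // whole lines; the real code
	         {                                // prefetches s and dcbz's d
	           for (size_t i = 0; i < line; i++)
	             d[i] = s[i];
	           d += line;
	           s += line;
	           len -= line;
	         }
	       while (len--)                      // 0..line-1 byte tail
	         *d++ = *s++;
	       return dst;
	     }  */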
L(dst_aligned):
	cmpdi	cr0,r9,0	/* Cache line size set?  */
	bne+	cr0,L(cachelineset)
	/* __cache_line_size not set: generic byte copy without much optimization  */
	clrldi.	r0,r5,63	/* If length is odd copy one byte  */
	beq	L(cachelinenotset_align)
	lbz	r7,0(r4)	/* Read one byte from source  */
	addi	r5,r5,-1	/* Update length  */
	addi	r4,r4,1		/* Update source pointer address  */
	stb	r7,0(r6)	/* Store one byte at dest  */
	addi	r6,r6,1		/* Update dest pointer address  */
L(cachelinenotset_align):
	cmpdi	cr7,r5,0	/* If length is 0 return  */
	beqlr	cr7
	ori	r2,r2,0		/* Force a new dispatch group  */
L(cachelinenotset_loop):
	addic.	r5,r5,-2	/* Update length  */
	lbz	r7,0(r4)	/* Load 2 bytes from source  */
	lbz	r8,1(r4)
	addi	r4,r4,2		/* Update source pointer address  */
	stb	r7,0(r6)	/* Store 2 bytes on dest  */
	stb	r8,1(r6)
	addi	r6,r6,2		/* Update dest pointer address  */
	bne	L(cachelinenotset_loop)
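	/* The loop above moves two bytes per iteration to halve the branch
	   overhead; any odd byte was peeled off before entering it.  */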
	blr

L(cachelineset):
	cmpd	cr5,r5,r10	/* Less than a cacheline to go?  */

	neg	r7,r6		/* How far to next cacheline bdy?  */

	addi	r6,r6,-8	/* prepare for stdu  */
	cmpdi	cr0,r9,128	/* using 128-byte cache lines?  */
	addi	r4,r4,-8	/* prepare for ldu  */

	ble+	cr5,L(lessthancacheline)

	beq-	cr0,L(big_lines) /* 128-byte line code  */
	/* More than a cacheline left to go, and using 64 byte cachelines  */

	clrldi	r7,r7,64-6	/* How far to next cacheline bdy?  */

	cmpldi	cr6,r7,0	/* Are we on a cacheline bdy already?  */
	/* Reduce total len by what it takes to get to the next cache line  */
	subf	r5,r7,r5
	srdi	r7,r7,4		/* How many qws to get to the line bdy?  */

	/* How many full cache lines to copy after getting to a line bdy?  */
	srdi	r10,r5,6

	cmpldi	r10,0		/* If no full cache lines to copy ...  */
	li	r11,0		/* number of cachelines to copy with prefetch  */
	beq	L(nocacheprefetch)
	/* We are here because we have at least one full cache line to copy,
	   and therefore some pre-touching to do.  */

	cmpldi	r10,PREFETCH_AHEAD
	li	r12,64+8	/* prefetch distance  */
	ble	L(lessthanmaxprefetch)
	/* We can only do so much pre-fetching.  R11 will have the count of
	   lines left to prefetch after the initial batch of prefetches
	   are executed.  */

	subi	r11,r10,PREFETCH_AHEAD
	li	r10,PREFETCH_AHEAD
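	/* Worked example: with 10 full lines to copy, this leaves r11 = 6
	   lines for later and prefetches r10 = PREFETCH_AHEAD = 4 lines in
	   the initial batch.  */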
L(lessthanmaxprefetch):
	mtctr	r10

	/* At this point r10/ctr hold the number of lines to prefetch in this
	   initial batch, and r11 holds any remainder.  */
L(prefetchSRC):
	dcbt	r12,r4
	addi	r12,r12,64
	bdnz	L(prefetchSRC)
	/* Prefetching is done, or was not needed.

	   cr6 - are we on a cacheline boundary already?
	   r7 - number of quadwords to the next cacheline boundary  */

L(nocacheprefetch):
	mtctr	r7
	cmpldi	cr1,r5,64	/* Less than a cache line to copy?  */
	/* How many bytes are left after we copy whatever full
	   cache lines we can get?  */
	clrldi	r5,r5,64-6

	beq	cr6,L(cachelinealigned)
	/* Copy quadwords up to the next cacheline boundary  */
L(aligntocacheline):
	ld	r9,0x08(r4)
	ld	r7,0x10(r4)
	addi	r4,r4,0x10
	std	r9,0x08(r6)
	stdu	r7,0x10(r6)
	bdnz	L(aligntocacheline)
L(cachelinealigned):		/* copy whole cache lines  */
	blt-	cr1,L(lessthancacheline) /* size < 64  */

	li	r11,64*ZERO_AHEAD+8 /* DCBZ dist  */
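	/* r11 = 2*64+8 = 136: the loop's dcbz establishes each destination
	   line two lines ahead directly in the cache, so it is never fetched
	   from memory; the +8 again matches the stdu bias on r6.  */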
	/* Copy whole cachelines, optimized by prefetching SRC cacheline  */
L(loop):			/* Copy aligned body  */
	dcbt	r12,r4		/* PREFETCH SOURCE some cache lines ahead  */
L(loop2):			/* Copy aligned body  */
L(lessthancacheline):		/* Was there less than a cache line to do?  */
	cmpldi	cr0,r5,16	/* fewer than 16 bytes left?  */
	srdi	r7,r5,4		/* divide size by 16  */
	blt-	L(do_lt16)
	mtctr	r7
	clrldi	r5,r5,64-4	/* keep the 0-15 byte remainder  */

L(copy_remaining):
	ld	r8,0x08(r4)
	ld	r7,0x10(r4)
	addi	r4,r4,0x10
	std	r8,0x08(r6)
	stdu	r7,0x10(r6)
	bdnz	L(copy_remaining)
L(do_lt16):			/* less than 16?  */
	cmpldi	cr0,r5,0	/* copy remaining bytes (0-15)  */
	beqlr+			/* no rest to copy  */
	addi	r4,r4,8		/* undo the ldu/stdu bias  */
	addi	r6,r6,8
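	/* The short copy below reuses the cr7 bit-dispatch trick from the
	   alignment code, keyed this time off the low four bits of the
	   remaining length.  */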
L(shortcopy):			/* SIMPLE COPY to handle size =< 15 bytes  */
	mtcrf	0x01,r5
	sub	r7,r4,r6	/* offset to src from dest  */
	bf-	cr7*4+0,8f
	ldx	r0,r7,r6	/* copy 8 bytes  */
	std	r0,0(r6)
	addi	r6,r6,8

8:	bf	cr7*4+1,4f
	lwzx	r0,r7,r6	/* copy 4 bytes  */
	stw	r0,0(r6)
	addi	r6,r6,4

4:	bf	cr7*4+2,2f
	lhzx	r0,r7,r6	/* copy 2 bytes  */
	sth	r0,0(r6)
	addi	r6,r6,2

2:	bf	cr7*4+3,1f
	lbzx	r0,r7,r6	/* copy 1 byte  */
	stb	r0,0(r6)

1:	blr
	/* Similar to above, but for use with 128 byte lines.  */
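	/* Only the constants change: the boundary masks keep the low seven
	   bits instead of six, the prefetch distance is 128+8, and full
	   lines are counted with a shift by 7.  */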
L(big_lines):
	clrldi	r7,r7,64-7	/* How far to next cacheline bdy?  */

	cmpldi	cr6,r7,0	/* Are we on a cacheline bdy already?  */
	/* Reduce total len by what it takes to get to the next cache line  */
	subf	r5,r7,r5
	srdi	r7,r7,4		/* How many qws to get to the line bdy?  */

	/* How many full cache lines to copy after getting to a line bdy?  */
	srdi	r10,r5,7

	cmpldi	r10,0		/* If no full cache lines to copy ...  */
	li	r11,0		/* number of cachelines to copy with prefetch  */
	beq	L(nocacheprefetch_128)
	/* We are here because we have at least one full cache line to copy,
	   and therefore some pre-touching to do.  */

	cmpldi	r10,PREFETCH_AHEAD
	li	r12,128+8	/* prefetch distance  */
	ble	L(lessthanmaxprefetch_128)
	/* We can only do so much pre-fetching.  R11 will have the count of
	   lines left to prefetch after the initial batch of prefetches
	   are executed.  */

	subi	r11,r10,PREFETCH_AHEAD
	li	r10,PREFETCH_AHEAD
L(lessthanmaxprefetch_128):
	mtctr	r10

	/* At this point r10/ctr hold the number of lines to prefetch in this
	   initial batch, and r11 holds any remainder.  */
L(prefetchSRC_128):
	dcbt	r12,r4
	addi	r12,r12,128
	bdnz	L(prefetchSRC_128)
	/* Prefetching is done, or was not needed.

	   cr6 - are we on a cacheline boundary already?
	   r7 - number of quadwords to the next cacheline boundary  */

L(nocacheprefetch_128):
	mtctr	r7
	cmpldi	cr1,r5,128	/* Less than a cache line to copy?  */
	/* How many bytes are left after we copy whatever full
	   cache lines we can get?  */
	clrldi	r5,r5,64-7

	beq	cr6,L(cachelinealigned_128)
	/* Copy quadwords up to the next cacheline boundary  */
L(aligntocacheline_128):
	ld	r9,0x08(r4)
	ld	r7,0x10(r4)
	addi	r4,r4,0x10
	std	r9,0x08(r6)
	stdu	r7,0x10(r6)
	bdnz	L(aligntocacheline_128)
L(cachelinealigned_128):	/* copy whole cache lines  */
	blt-	cr1,L(lessthancacheline) /* size < 128  */

	li	r11,128*ZERO_AHEAD+8 /* DCBZ dist  */
	/* Copy whole cachelines, optimized by prefetching SRC cacheline  */
L(loop_128):			/* Copy aligned body  */
	dcbt	r12,r4		/* PREFETCH SOURCE some cache lines ahead  */
L(loop2_128):			/* Copy aligned body  */
	b	L(lessthancacheline)
END_GEN_TB (MEMCPY,TB_TOCLESS)
libc_hidden_builtin_def (memcpy)