]> git.ipfire.org Git - thirdparty/glibc.git/blame - sysdeps/powerpc/powerpc64/cell/memcpy.S
Update copyright dates with scripts/update-copyrights.
[thirdparty/glibc.git] / sysdeps / powerpc / powerpc64 / cell / memcpy.S
CommitLineData
057edf90 1/* Optimized memcpy implementation for CELL BE PowerPC.
b168057a 2 Copyright (C) 2010-2015 Free Software Foundation, Inc.
057edf90
UD
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
59ba27a6
PE
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
057edf90
UD
18
19#include <sysdep.h>
057edf90
UD
20
21#define PREFETCH_AHEAD 6 /* number of SRC cache lines to prefetch ahead */
22#define ZERO_AHEAD 4 /* number of DST cache lines to zero (dcbz) ahead */
23
24/* memcpy routine optimized for CELL-BE-PPC v2.0
25 *
26 * The CELL PPC core has 1 integer unit and 1 load/store unit
27 * CELL:
28 * 1st level data cache = 32K
29 * 2nd level data cache = 512K
30 * 3rd level data cache = 0K
31 * With 3.2 GHz clockrate the latency to 2nd level cache is >36 clocks,
32 * latency to memory is >400 clocks
33 * To improve copy performance we need to prefetch source data
34 * far ahead to hide this latency
2ccdea26 35 * For best performance instruction forms ending in "." like "andi."
057edf90
UD
36 * should be avoided as they are implemented in microcode on CELL.
37 * The below code is loop unrolled for the CELL cache line of 128 bytes
38 */
39
40.align 7
41
/* memcpy: copy r5 bytes from the address in r4 (SRC) to the address in
   r3 (DST).  r3 is never written below, so it still holds DST on return.

   Register roles in the code below:
     r6        working DST pointer (copy of r3, advanced as bytes are stored)
     r4        working SRC pointer
     r5        bytes still to copy
     r7        SRC - DST offset in the byte-granular head/tail code;
               elsewhere a chunk count or a data temporary
     r0,r8,r9  data temporaries (r8 first holds the #bytes to the
               16-byte DST boundary)
     r10       cache lines copied by .Lloop2 (no dcbt/dcbz)
     r11       cache lines copied by .Lloop; then reused as the dcbz offset
     r12       dcbt offset relative to r4

   From .Ldst_aligned onwards r4 and r6 are biased by -8 so that the
   update-form ldu/stdu instructions can advance them; the bias is
   removed again before falling into .Lshortcopy.  */
2d67d91a 42EALIGN (memcpy, 5, 0)
d6ac9329 43 CALL_MCOUNT 3
057edf90
UD
44
45 dcbt 0,r4 /* Prefetch ONE SRC cacheline */
46 cmpldi cr1,r5,16 /* is size < 16 ? */
d6ac9329 47 mr r6,r3 /* r6 = working DST pointer; r3 is left untouched */
057edf90
UD
48 blt+ cr1,.Lshortcopy
49
50.Lbigcopy:
51 neg r8,r3 /* r8 = -DST; its LS 4 bits = # bytes to 16-byte dest bdry */
c0c3f78a 52 clrldi r8,r8,64-4 /* align to 16byte boundary */
057edf90
UD
53 sub r7,r4,r3 /* r7 = SRC - DST, so r7+r6 addresses the matching SRC byte */
54 cmpldi cr0,r8,0
55 beq+ .Ldst_aligned
56
/* Copy 1, 2, 4 and/or 8 bytes, as selected by the low 4 bits of r8 held
   in cr7, so that r6 ends up on a 16-byte boundary.  */
57.Ldst_unaligned:
58 mtcrf 0x01,r8 /* put #bytes to boundary into cr7 */
59 subf r5,r8,r5 /* size -= bytes consumed by the alignment copies */
60
61 bf cr7*4+3,1f
62 lbzx r0,r7,r6 /* copy 1 byte */
63 stb r0,0(r6)
64 addi r6,r6,1
651: bf cr7*4+2,2f
66 lhzx r0,r7,r6 /* copy 2 byte */
67 sth r0,0(r6)
68 addi r6,r6,2
692: bf cr7*4+1,4f
70 lwzx r0,r7,r6 /* copy 4 byte */
71 stw r0,0(r6)
72 addi r6,r6,4
734: bf cr7*4+0,8f
74 ldx r0,r7,r6 /* copy 8 byte */
75 std r0,0(r6)
76 addi r6,r6,8
778:
78 add r4,r7,r6 /* rebuild SRC pointer: (SRC - DST) + advanced DST */
79
80.Ldst_aligned:
81
/* NOTE(review): signed compare on a size; the other size checks on r5
   use cmpldi -- confirm this is intentional.  */
82 cmpdi cr5,r5,128-1
83
84 neg r7,r6 /* r7 = -DST; masked below to # bytes to the next 128-byte line */
85 addi r6,r6,-8 /* prepare for stdu */
86 addi r4,r4,-8 /* prepare for ldu */
87
88 clrldi r7,r7,64-7 /* align to cacheline boundary */
89 ble+ cr5,.Llessthancacheline
90
91 cmpldi cr6,r7,0 /* cr6.eq: DST is already cacheline aligned */
92 subf r5,r7,r5 /* size left once DST is cacheline aligned */
93 srdi r7,r7,4 /* # of 16-byte chunks up to the cacheline boundary */
94 srdi r10,r5,7 /* number of cache lines to copy */
95
96 cmpldi r10,0
97 li r11,0 /* number cachelines to copy with prefetch */
98 beq .Lnocacheprefetch
99
100 cmpldi r10,PREFETCH_AHEAD
101 li r12,128+8 /* prefetch distance */
102 ble .Llessthanmaxprefetch
103
104 subi r11,r10,PREFETCH_AHEAD /* lines left for the prefetching loop */
105 li r10,PREFETCH_AHEAD /* cap the start-up prefetch count */
106
107.Llessthanmaxprefetch:
108 mtctr r10
109
/* Touch r10 SRC cache lines up front; r12 ends up as the dcbt offset
   used later by .Lloop.  */
110.LprefetchSRC:
111 dcbt r12,r4
d6ac9329
UD
112 addi r12,r12,128
113 bdnz .LprefetchSRC
057edf90
UD
114
115.Lnocacheprefetch:
116 mtctr r7 /* ctr = 16-byte chunks up to the cacheline boundary */
117 cmpldi cr1,r5,128 /* cr1.lt: no full cache line left */
118 clrldi r5,r5,64-7 /* r5 = tail bytes after the whole cache lines */
119 beq cr6,.Lcachelinealigned
120
/* Copy 16 bytes per iteration until DST reaches a cacheline boundary.  */
121.Laligntocacheline:
d6ac9329 122 ld r9,0x08(r4)
057edf90
UD
123 ldu r7,0x10(r4)
124 std r9,0x08(r6)
125 stdu r7,0x10(r6)
126 bdnz .Laligntocacheline
127
128
129.Lcachelinealigned: /* copy while cache lines */
130
d6ac9329 131 blt- cr1,.Llessthancacheline /* size <128 */
057edf90
UD
132
133.Louterloop:
d6ac9329 134 cmpdi r11,0
057edf90
UD
135 mtctr r11
136 beq- .Lendloop
137
/* r11 is reused as the dcbz offset from here on; the +8 compensates for
   the -8 bias on r6.  */
138 li r11,128*ZERO_AHEAD +8 /* DCBZ dist */
139
140.align 4
141 /* Copy whole cachelines, optimized by prefetching SRC cacheline */
d6ac9329 142.Lloop: /* Copy aligned body */
057edf90
UD
143 dcbt r12,r4 /* PREFETCH SOURCE some cache lines ahead */
144 ld r9, 0x08(r4)
145 dcbz r11,r6
146 ld r7, 0x10(r4) /* 4 register stride copy is optimal */
2ccdea26 147 ld r8, 0x18(r4) /* to hide 1st level cache latency. */
057edf90
UD
148 ld r0, 0x20(r4)
149 std r9, 0x08(r6)
150 std r7, 0x10(r6)
151 std r8, 0x18(r6)
152 std r0, 0x20(r6)
153 ld r9, 0x28(r4)
154 ld r7, 0x30(r4)
155 ld r8, 0x38(r4)
156 ld r0, 0x40(r4)
157 std r9, 0x28(r6)
158 std r7, 0x30(r6)
159 std r8, 0x38(r6)
160 std r0, 0x40(r6)
161 ld r9, 0x48(r4)
162 ld r7, 0x50(r4)
163 ld r8, 0x58(r4)
164 ld r0, 0x60(r4)
165 std r9, 0x48(r6)
166 std r7, 0x50(r6)
167 std r8, 0x58(r6)
168 std r0, 0x60(r6)
169 ld r9, 0x68(r4)
170 ld r7, 0x70(r4)
171 ld r8, 0x78(r4)
172 ldu r0, 0x80(r4)
173 std r9, 0x68(r6)
174 std r7, 0x70(r6)
175 std r8, 0x78(r6)
176 stdu r0, 0x80(r6)
177
178 bdnz .Lloop
179
/* Copy the remaining r10 whole cache lines without dcbt/dcbz, as four
   32-byte iterations per line.  */
180.Lendloop:
181 cmpdi r10,0
182 sldi r10,r10,2 /* adjust from 128 to 32 byte stride */
183 beq- .Lendloop2
184 mtctr r10
185
d6ac9329 186.Lloop2: /* Copy aligned body */
057edf90
UD
187 ld r9, 0x08(r4)
188 ld r7, 0x10(r4)
189 ld r8, 0x18(r4)
190 ldu r0, 0x20(r4)
191 std r9, 0x08(r6)
192 std r7, 0x10(r6)
193 std r8, 0x18(r6)
194 stdu r0, 0x20(r6)
195
196 bdnz .Lloop2
197.Lendloop2:
198
199.Llessthancacheline: /* less than a cache line left to do ? */
200 cmpldi cr0,r5,16
201 srdi r7,r5,4 /* divide size by 16 */
202 blt- .Ldo_lt16
203 mtctr r7
204
205.Lcopy_remaining:
d6ac9329 206 ld r8,0x08(r4)
057edf90
UD
207 ldu r7,0x10(r4)
208 std r8,0x08(r6)
209 stdu r7,0x10(r6)
210 bdnz .Lcopy_remaining
211
212.Ldo_lt16: /* less than 16 ? */
213 cmpldi cr0,r5,0 /* copy remaining bytes (0-15) */
d6ac9329 214 beqlr+ /* no rest to copy */
057edf90
UD
215 addi r4,r4,8 /* undo the -8 ldu bias on SRC */
216 addi r6,r6,8 /* undo the -8 stdu bias on DST */
217
218.Lshortcopy: /* SIMPLE COPY to handle size <= 15 bytes */
219 mtcrf 0x01,r5 /* LS 4 bits of size select the 8/4/2/1 byte copies */
220 sub r7,r4,r6 /* r7 = SRC - DST */
221 bf- cr7*4+0,8f
222 ldx r0,r7,r6 /* copy 8 byte */
223 std r0,0(r6)
224 addi r6,r6,8
2258:
226 bf cr7*4+1,4f
227 lwzx r0,r7,r6 /* copy 4 byte */
228 stw r0,0(r6)
229 addi r6,r6,4
2304:
231 bf cr7*4+2,2f
232 lhzx r0,r7,r6 /* copy 2 byte */
233 sth r0,0(r6)
234 addi r6,r6,2
2352:
236 bf cr7*4+3,1f
237 lbzx r0,r7,r6 /* copy 1 byte */
238 stb r0,0(r6)
2391: blr
240
2d67d91a 241END_GEN_TB (memcpy,TB_TOCLESS)
057edf90 242libc_hidden_builtin_def (memcpy)