/* Optimized memcpy implementation for CELL BE PowerPC.
   Copyright (C) 2010-2019 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>

#ifndef MEMCPY
# define MEMCPY memcpy
#endif

#define PREFETCH_AHEAD 6        /* number of SRC cache lines to prefetch ahead  */
#define ZERO_AHEAD 4            /* number of DST cache lines to zero (dcbz) ahead  */

/* memcpy routine optimized for CELL-BE-PPC v2.0
 *
 * The CELL PPC core has 1 integer unit and 1 load/store unit
 * CELL:
 * 1st level data cache = 32K
 * 2nd level data cache = 512K
 * 3rd level data cache = 0K
 * With 3.2 GHz clockrate the latency to 2nd level cache is >36 clocks,
 * latency to memory is >400 clocks
 * To improve copy performance we need to prefetch source data
 * far ahead to hide this latency
 * For best performance, instruction forms ending in "." like "andi."
 * should be avoided as they are implemented in microcode on CELL.
 * The code below is loop-unrolled for the CELL cache line of 128 bytes
 */
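
/* Illustrative only, not assembled: a rough C-level sketch of the copy
 * strategy used below, assuming the destination is already 128-byte
 * aligned and the length is a whole number of cache lines.  The real
 * code also handles unaligned heads, sub-cacheline middles and 0-15
 * byte tails, and performs each 128-byte copy as 16 ld/std pairs.
 *
 *   for (i = 0; i < lines; i++)
 *     {
 *       prefetch (src + 128 * (i + PREFETCH_AHEAD));    // dcbt
 *       zero_dst_line (dst + 128 * (i + ZERO_AHEAD));   // dcbz
 *       copy_128_bytes (dst + 128 * i, src + 128 * i);
 *     }
 *
 * prefetch, zero_dst_line and copy_128_bytes are hypothetical helpers
 * standing in for the dcbt, dcbz and ld/std sequences in this file.
 */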

.align 7

ENTRY_TOCLESS (MEMCPY, 5)
        CALL_MCOUNT 3

        dcbt    0,r4            /* Prefetch ONE SRC cacheline  */
        cmpldi  cr1,r5,16       /* is size < 16 ?  */
        mr      r6,r3
        blt+    cr1,.Lshortcopy

.Lbigcopy:
        neg     r8,r3           /* LS 4 bits = # bytes to 16-byte dest bdry  */
        clrldi  r8,r8,64-4      /* align to 16-byte boundary  */
        sub     r7,r4,r3
        cmpldi  cr0,r8,0
        beq+    .Ldst_aligned

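/* Copy 1/2/4/8 bytes as needed to reach a 16-byte aligned destination.
   mtcrf 0x01,r8 moves the low four bits of r8 (bytes to the boundary)
   into cr7, and each bf test below skips the correspondingly sized
   chunk.  r7 = src - dst, so the indexed loads via r7,r6 read from the
   source while r6 walks the destination; afterwards r4 is rebuilt as
   r7 + r6, i.e. the advanced source pointer.  */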
.Ldst_unaligned:
        mtcrf   0x01,r8         /* put #bytes to boundary into cr7  */
        subf    r5,r8,r5

        bf      cr7*4+3,1f
        lbzx    r0,r7,r6        /* copy 1 byte  */
        stb     r0,0(r6)
        addi    r6,r6,1
1:      bf      cr7*4+2,2f
        lhzx    r0,r7,r6        /* copy 2 byte  */
        sth     r0,0(r6)
        addi    r6,r6,2
2:      bf      cr7*4+1,4f
        lwzx    r0,r7,r6        /* copy 4 byte  */
        stw     r0,0(r6)
        addi    r6,r6,4
4:      bf      cr7*4+0,8f
        ldx     r0,r7,r6        /* copy 8 byte  */
        std     r0,0(r6)
        addi    r6,r6,8
8:
        add     r4,r7,r6

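/* The destination is now 16-byte aligned.  r6 and r4 are biased by -8
   below so the copy loops can use ldu/stdu with pre-increment; r7 is
   set to the number of bytes needed to reach the next 128-byte
   (cache line) boundary of the destination.  */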
.Ldst_aligned:

        cmpdi   cr5,r5,128-1

        neg     r7,r6
        addi    r6,r6,-8        /* prepare for stdu  */
        addi    r4,r4,-8        /* prepare for ldu  */

        clrldi  r7,r7,64-7      /* align to cacheline boundary  */
        ble+    cr5,.Llessthancacheline

        cmpldi  cr6,r7,0
        subf    r5,r7,r5
        srdi    r7,r7,4         /* divide size by 16  */
        srdi    r10,r5,7        /* number of cache lines to copy  */

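        /* r10 now holds the number of whole cache lines to copy.  Cap
           the up-front software prefetch at PREFETCH_AHEAD lines (r10);
           any lines beyond that (r11) are prefetched one per iteration
           from inside the main copy loop, with r12 as the prefetch
           offset from the current source pointer.  */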
        cmpldi  r10,0
        li      r11,0           /* number of cache lines to copy with prefetch  */
        beq     .Lnocacheprefetch

        cmpldi  r10,PREFETCH_AHEAD
        li      r12,128+8       /* prefetch distance  */
        ble     .Llessthanmaxprefetch

        subi    r11,r10,PREFETCH_AHEAD
        li      r10,PREFETCH_AHEAD

.Llessthanmaxprefetch:
        mtctr   r10

.LprefetchSRC:
        dcbt    r12,r4
        addi    r12,r12,128
        bdnz    .LprefetchSRC

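/* First bring the destination up to a 128-byte (cache line) boundary,
   copying 16 bytes per iteration; r7 was set above to the distance to
   that boundary divided by 16.  cr6 is EQ when no such head copy is
   needed, and cr1 remembers whether at least one whole cache line
   remains.  */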
.Lnocacheprefetch:
        mtctr   r7
        cmpldi  cr1,r5,128
        clrldi  r5,r5,64-7
        beq     cr6,.Lcachelinealigned

.Laligntocacheline:
        ld      r9,0x08(r4)
        ldu     r7,0x10(r4)
        std     r9,0x08(r6)
        stdu    r7,0x10(r6)
        bdnz    .Laligntocacheline


.Lcachelinealigned:             /* copy whole cache lines  */

        blt-    cr1,.Llessthancacheline /* size < 128  */

.Louterloop:
        cmpdi   r11,0
        mtctr   r11
        beq-    .Lendloop

        li      r11,128*ZERO_AHEAD+8    /* DCBZ distance  */

.align 4
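        /* dcbt r12,r4 keeps prefetching the source ahead of the copy,
           while dcbz r11,r6 establishes each destination line
           ZERO_AHEAD lines ahead directly in the cache so it need not
           be fetched from memory before being overwritten.  */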
        /* Copy whole cachelines, optimized by prefetching SRC cacheline  */
.Lloop:                         /* Copy aligned body  */
        dcbt    r12,r4          /* PREFETCH SOURCE some cache lines ahead  */
        ld      r9, 0x08(r4)
        dcbz    r11,r6
        ld      r7, 0x10(r4)    /* 4 register stride copy is optimal  */
        ld      r8, 0x18(r4)    /* to hide 1st level cache latency.  */
        ld      r0, 0x20(r4)
        std     r9, 0x08(r6)
        std     r7, 0x10(r6)
        std     r8, 0x18(r6)
        std     r0, 0x20(r6)
        ld      r9, 0x28(r4)
        ld      r7, 0x30(r4)
        ld      r8, 0x38(r4)
        ld      r0, 0x40(r4)
        std     r9, 0x28(r6)
        std     r7, 0x30(r6)
        std     r8, 0x38(r6)
        std     r0, 0x40(r6)
        ld      r9, 0x48(r4)
        ld      r7, 0x50(r4)
        ld      r8, 0x58(r4)
        ld      r0, 0x60(r4)
        std     r9, 0x48(r6)
        std     r7, 0x50(r6)
        std     r8, 0x58(r6)
        std     r0, 0x60(r6)
        ld      r9, 0x68(r4)
        ld      r7, 0x70(r4)
        ld      r8, 0x78(r4)
        ldu     r0, 0x80(r4)
        std     r9, 0x68(r6)
        std     r7, 0x70(r6)
        std     r8, 0x78(r6)
        stdu    r0, 0x80(r6)

        bdnz    .Lloop

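/* Copy the cache lines that were already prefetched up front (r10 of
   them); no dcbt is needed here, so a shorter 32-byte stride loop is
   used.  sldi converts the line count into the number of 32-byte
   iterations.  */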
.Lendloop:
        cmpdi   r10,0
        sldi    r10,r10,2       /* adjust from 128 to 32 byte stride  */
        beq-    .Lendloop2
        mtctr   r10

.Lloop2:                        /* Copy aligned body  */
        ld      r9, 0x08(r4)
        ld      r7, 0x10(r4)
        ld      r8, 0x18(r4)
        ldu     r0, 0x20(r4)
        std     r9, 0x08(r6)
        std     r7, 0x10(r6)
        std     r8, 0x18(r6)
        stdu    r0, 0x20(r6)

        bdnz    .Lloop2
.Lendloop2:

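/* Fewer than 128 bytes remain: copy 16 bytes per iteration while
   possible, then fall through to the 0-15 byte tail handling.  */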
.Llessthancacheline:            /* less than a cache line to do ?  */
        cmpldi  cr0,r5,16
        srdi    r7,r5,4         /* divide size by 16  */
        blt-    .Ldo_lt16
        mtctr   r7

.Lcopy_remaining:
        ld      r8,0x08(r4)
        ldu     r7,0x10(r4)
        std     r8,0x08(r6)
        stdu    r7,0x10(r6)
        bdnz    .Lcopy_remaining

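/* Handle the 0-15 byte tail.  The addi pair below undoes the -8 bias
   that was applied to r4 and r6 for the ldu/stdu loops.  */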
.Ldo_lt16:                      /* less than 16 ?  */
        cmpldi  cr0,r5,0        /* copy remaining bytes (0-15)  */
        beqlr+                  /* no rest to copy  */
        addi    r4,r4,8
        addi    r6,r6,8

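/* Tail/short copy: mtcrf 0x01,r5 moves the low four bits of the
   remaining size into cr7, and the bf tests copy 8, 4, 2 and finally
   1 byte as flagged.  r7 = src - dst again provides indexed source
   addressing while r6 advances through the destination.  */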
.Lshortcopy:                    /* SIMPLE COPY to handle size <= 15 bytes  */
        mtcrf   0x01,r5
        sub     r7,r4,r6
        bf-     cr7*4+0,8f
        ldx     r0,r7,r6        /* copy 8 byte  */
        std     r0,0(r6)
        addi    r6,r6,8
8:
        bf      cr7*4+1,4f
        lwzx    r0,r7,r6        /* copy 4 byte  */
        stw     r0,0(r6)
        addi    r6,r6,4
4:
        bf      cr7*4+2,2f
        lhzx    r0,r7,r6        /* copy 2 byte  */
        sth     r0,0(r6)
        addi    r6,r6,2
2:
        bf      cr7*4+3,1f
        lbzx    r0,r7,r6        /* copy 1 byte  */
        stb     r0,0(r6)
1:      blr

END_GEN_TB (MEMCPY,TB_TOCLESS)
libc_hidden_builtin_def (memcpy)