/* Provenance: thirdparty/glibc.git, sysdeps/powerpc/powerpc32/cell/memcpy.S
   (text recovered from the git.ipfire.org blame view; latest commit:
   "Update copyright dates with scripts/update-copyrights.").  */
/* Optimized memcpy implementation for CELL BE PowerPC.
   Copyright (C) 2010-2020 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* Tuning knobs, both counted in 128-byte cache lines.
   The main copy loop issues dcbz ZERO_AHEAD lines ahead of the line being
   written and only runs for (lines - PREFETCH_AHEAD) iterations, so
   ZERO_AHEAD must not exceed PREFETCH_AHEAD or dcbz would zero memory past
   the end of the destination.  */
#define PREFETCH_AHEAD	6	/* number of cache lines of SRC prefetched ahead */
#define ZERO_AHEAD	4	/* number of cache lines of DST zeroed ahead */

/* memcpy routine optimized for CELL-BE-PPC v2.0
 *
 * The CELL PPC core has 1 integer unit and 1 load/store unit
 * CELL:
 * 1st level data cache = 32K
 * 2nd level data cache = 512K
 * 3rd level data cache = 0K
 * With 3.2 GHz clockrate the latency to 2nd level cache is >36 clocks,
 * latency to memory is >400 clocks
 * To improve copy performance we need to prefetch source data
 * far ahead to hide this latency
 * For best performance instruction forms ending in "." like "andi."
 * should be avoided as they are implemented in microcode on CELL.
 * The below code is loop unrolled for the CELL cache line of 128 bytes
 */

.align 7

/* memcpy: copy r5 bytes from the address in r4 to the address in r3.
 *
 * Register roles as used below:
 *   r3        destination start; never written, so still holds it at return
 *   r4        source pointer (biased by -8 while the lfd/lfdu loops run)
 *   r5        bytes still to copy
 *   r6        working destination pointer (biased by -8 in the fp loops)
 *   r7        scratch: (src - dst) offset for the indexed byte/half/word/
 *             double copies, later the 16-byte chunk counts
 *   r8        bytes needed to bring the destination to a 16-byte boundary
 *   r10       cache lines copied by .Lloop2 (no further prefetching)
 *   r11       cache lines copied by .Lloop (with prefetch + dcbz); reused
 *             inside that loop as the dcbz offset
 *   r12       offset used by dcbt to prefetch the source
 *   r0        scratch for 1/2/4-byte moves
 *   fp9-fp12  8-byte copy registers
 *   ctr, cr0, cr1, cr5, cr6, cr7 are overwritten.
 *
 * Phases: (1) sizes < 16 go straight to .Lshortcopy; (2) align dst to
 * 16 bytes; (3) for >= 128 bytes, align dst to a 128-byte cache line, copy
 * whole lines, then (4) copy remaining 16-byte chunks and (5) the final
 * 0-15 bytes via .Lshortcopy.
 */
EALIGN (memcpy, 5, 0)
	CALL_MCOUNT

	dcbt	0,r4		/* Prefetch ONE SRC cacheline */
	cmplwi	cr1,r5,16	/* is size < 16 ? */
	mr	r6,r3		/* work on a copy so r3 keeps the original dst */
	blt+	cr1,.Lshortcopy

.Lbigcopy:
	neg	r8,r3		/* low bits of -dst = # bytes up to next boundary */
	clrlwi	r8,r8,32-4	/* keep low 4 bits: bytes to 16-byte boundary */
	sub	r7,r4,r3	/* r7 = src - dst, so src == r7 + r6 below */
	cmplwi	cr0,r8,0
	beq+	.Ldst_aligned

.Ldst_unaligned:
	mtcrf	0x01,r8		/* put #bytes to boundary into cr7 */
	subf	r5,r8,r5	/* those bytes no longer count as remaining */

	bf	cr7*4+3,1f
	lbzx	r0,r7,r6	/* copy 1 byte */
	stb	r0,0(r6)
	addi	r6,r6,1
1:	bf	cr7*4+2,2f
	lhzx	r0,r7,r6	/* copy 2 byte */
	sth	r0,0(r6)
	addi	r6,r6,2
2:	bf	cr7*4+1,4f
	lwzx	r0,r7,r6	/* copy 4 byte */
	stw	r0,0(r6)
	addi	r6,r6,4
4:	bf	cr7*4+0,8f
	lfdx	fp9,r7,r6	/* copy 8 byte */
	stfd	fp9,0(r6)
	addi	r6,r6,8
8:
	add	r4,r7,r6	/* rebuild src pointer from offset + new dst */

.Ldst_aligned:

	/* The length is unsigned (size_t): use an unsigned compare like every
	   other length test in this routine, so lengths >= 2^31 also take the
	   cache-line path instead of the slow 16-byte loop.  */
	cmplwi	cr5,r5,128-1

	neg	r7,r6
	addi	r6,r6,-8	/* prepare for stfdu */
	addi	r4,r4,-8	/* prepare for lfdu */

	clrlwi	r7,r7,32-7	/* r7 = bytes to next cacheline boundary */
	ble+	cr5,.Llessthancacheline

	cmplwi	cr6,r7,0	/* cr6.eq: dst already cacheline aligned */
	subf	r5,r7,r5
	srwi	r7,r7,4		/* 16-byte chunks up to the cacheline boundary */
	srwi	r10,r5,7	/* number of cache lines to copy */

	cmplwi	r10,0
	li	r11,0		/* number cachelines to copy with prefetch */
	beq	.Lnocacheprefetch

	cmplwi	r10,PREFETCH_AHEAD
	li	r12,128+8	/* prefetch distance */
	ble	.Llessthanmaxprefetch

	/* More than PREFETCH_AHEAD lines: the surplus goes through .Lloop
	   (prefetching), the last PREFETCH_AHEAD lines through .Lloop2.  */
	subi	r11,r10,PREFETCH_AHEAD
	li	r10,PREFETCH_AHEAD

.Llessthanmaxprefetch:
	mtctr	r10

.LprefetchSRC:
	dcbt	r12,r4
	addi	r12,r12,128
	bdnz	.LprefetchSRC

.Lnocacheprefetch:
	mtctr	r7
	cmplwi	cr1,r5,128	/* cr1: at least one whole cacheline left ? */
	clrlwi	r5,r5,32-7	/* r5 = bytes left after the whole cachelines */
	beq	cr6,.Lcachelinealigned

.Laligntocacheline:
	lfd	fp9,0x08(r4)
	lfdu	fp10,0x10(r4)
	stfd	fp9,0x08(r6)
	stfdu	fp10,0x10(r6)
	bdnz	.Laligntocacheline


.Lcachelinealigned:		/* copy whole cache lines */

	blt-	cr1,.Llessthancacheline	/* size <128 */

.Louterloop:
	cmpwi	r11,0
	mtctr	r11
	beq-	.Lendloop

	li	r11,128*ZERO_AHEAD +8	/* DCBZ dist */

.align 4
	/* Copy whole cachelines, optimized by prefetching SRC cacheline */
.Lloop:				/* Copy aligned body */
	dcbt	r12,r4		/* PREFETCH SOURCE some cache lines ahead */
	lfd	fp9, 0x08(r4)
	dcbz	r11,r6		/* zero DST cacheline ZERO_AHEAD lines ahead */
	lfd	fp10, 0x10(r4)	/* 4 register stride copy is optimal */
	lfd	fp11, 0x18(r4)	/* to hide 1st level cache latency.  */
	lfd	fp12, 0x20(r4)
	stfd	fp9, 0x08(r6)
	stfd	fp10, 0x10(r6)
	stfd	fp11, 0x18(r6)
	stfd	fp12, 0x20(r6)
	lfd	fp9, 0x28(r4)
	lfd	fp10, 0x30(r4)
	lfd	fp11, 0x38(r4)
	lfd	fp12, 0x40(r4)
	stfd	fp9, 0x28(r6)
	stfd	fp10, 0x30(r6)
	stfd	fp11, 0x38(r6)
	stfd	fp12, 0x40(r6)
	lfd	fp9, 0x48(r4)
	lfd	fp10, 0x50(r4)
	lfd	fp11, 0x58(r4)
	lfd	fp12, 0x60(r4)
	stfd	fp9, 0x48(r6)
	stfd	fp10, 0x50(r6)
	stfd	fp11, 0x58(r6)
	stfd	fp12, 0x60(r6)
	lfd	fp9, 0x68(r4)
	lfd	fp10, 0x70(r4)
	lfd	fp11, 0x78(r4)
	lfdu	fp12, 0x80(r4)	/* update form: advance src by one cacheline */
	stfd	fp9, 0x68(r6)
	stfd	fp10, 0x70(r6)
	stfd	fp11, 0x78(r6)
	stfdu	fp12, 0x80(r6)	/* update form: advance dst by one cacheline */

	bdnz	.Lloop

.Lendloop:
	cmpwi	r10,0
	slwi	r10,r10,2	/* adjust from 128 to 32 byte stride */
	beq-	.Lendloop2
	mtctr	r10

.Lloop2:			/* Copy aligned body */
	lfd	fp9, 0x08(r4)
	lfd	fp10, 0x10(r4)
	lfd	fp11, 0x18(r4)
	lfdu	fp12, 0x20(r4)
	stfd	fp9, 0x08(r6)
	stfd	fp10, 0x10(r6)
	stfd	fp11, 0x18(r6)
	stfdu	fp12, 0x20(r6)

	bdnz	.Lloop2
.Lendloop2:

.Llessthancacheline:		/* less than cache to do ? */
	cmplwi	cr0,r5,16
	srwi	r7,r5,4		/* divide size by 16 */
	blt-	.Ldo_lt16
	mtctr	r7

.Lcopy_remaining:
	lfd	fp9,0x08(r4)
	lfdu	fp10,0x10(r4)
	stfd	fp9,0x08(r6)
	stfdu	fp10,0x10(r6)
	bdnz	.Lcopy_remaining

.Ldo_lt16:			/* less than 16 ? */
	cmplwi	cr0,r5,0	/* copy remaining bytes (0-15) */
	beqlr+			/* no rest to copy */
	addi	r4,r4,8		/* undo the -8 bias applied for lfdu */
	addi	r6,r6,8		/* undo the -8 bias applied for stfdu */

.Lshortcopy:			/* SIMPLE COPY to handle size <= 15 bytes */
	mtcrf	0x01,r5		/* low 4 bits of the size select the moves */
	sub	r7,r4,r6	/* r7 = src - dst, so src == r7 + r6 below */
	bf-	cr7*4+0,8f
	lfdx	fp9,r7,r6	/* copy 8 byte */
	stfd	fp9,0(r6)
	addi	r6,r6,8
8:
	bf	cr7*4+1,4f
	lwzx	r0,r7,r6	/* copy 4 byte */
	stw	r0,0(r6)
	addi	r6,r6,4
4:
	bf	cr7*4+2,2f
	lhzx	r0,r7,r6	/* copy 2 byte */
	sth	r0,0(r6)
	addi	r6,r6,2
2:
	bf	cr7*4+3,1f
	lbzx	r0,r7,r6	/* copy 1 byte */
	stb	r0,0(r6)
1:	blr

END (memcpy)
libc_hidden_builtin_def (memcpy)