/* Optimized memcpy implementation for CELL BE PowerPC.
   Copyright (C) 2010-2019 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>

#ifndef MEMCPY
# define MEMCPY memcpy
#endif

#define PREFETCH_AHEAD 6     /* number of cache lines to prefetch ahead of SRC */
#define ZERO_AHEAD 4         /* number of cache lines to zero ahead of DST */

/* memcpy routine optimized for CELL-BE-PPC v2.0
 *
 * The CELL PPC core has 1 integer unit and 1 load/store unit
 * CELL:
 * 1st level data cache = 32K
 * 2nd level data cache = 512K
 * 3rd level data cache = 0K
 * With a 3.2 GHz clock rate the latency to 2nd level cache is >36 clocks,
 * and the latency to memory is >400 clocks
 * To improve copy performance we need to prefetch source data
 * far ahead to hide this latency
 * For best performance, instruction forms ending in "." like "andi."
 * should be avoided as they are implemented in microcode on CELL.
 * The code below is unrolled for the CELL cache line size of 128 bytes
 */
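
/* A rough sketch of the prefetch-distance arithmetic implied above: with
   PREFETCH_AHEAD = 6 and 128-byte cache lines, the dcbt stream in the main
   copy loop stays on the order of 6 * 128 = 768 bytes in front of the loads,
   which is meant to hide much of the >400-clock memory latency quoted above.
   Likewise ZERO_AHEAD = 4 keeps the dcbz stream roughly 4 * 128 = 512 bytes
   ahead of the stores.  */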

        .align 7

ENTRY_TOCLESS (MEMCPY, 5)
        CALL_MCOUNT 3

        dcbt 0,r4            /* Prefetch ONE SRC cacheline */
        cmpldi cr1,r5,16     /* is size < 16 ? */
        mr r6,r3
        blt+ cr1,.Lshortcopy

.Lbigcopy:
        neg r8,r3            /* LS 4 bits = # bytes to 16-byte dest bdry */
        clrldi r8,r8,64-4    /* align to 16-byte boundary */
        sub r7,r4,r3
        cmpldi cr0,r8,0
        beq+ .Ldst_aligned

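/* Copy the 0-15 byte head so that the destination becomes 16-byte aligned.
   mtcrf 0x01,r8 moves the low four bits of r8 into cr7; each "bf" below then
   skips its 1-, 2-, 4- or 8-byte copy when the matching bit is clear, and r7
   (= SRC - DST) lets every load be indexed off the updated DST pointer.  */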
.Ldst_unaligned:
        mtcrf 0x01,r8        /* put #bytes to boundary into cr7 */
        subf r5,r8,r5

        bf cr7*4+3,1f
        lbzx r0,r7,r6        /* copy 1 byte */
        stb r0,0(r6)
        addi r6,r6,1
1:      bf cr7*4+2,2f
        lhzx r0,r7,r6        /* copy 2 byte */
        sth r0,0(r6)
        addi r6,r6,2
2:      bf cr7*4+1,4f
        lwzx r0,r7,r6        /* copy 4 byte */
        stw r0,0(r6)
        addi r6,r6,4
4:      bf cr7*4+0,8f
        ldx r0,r7,r6         /* copy 8 byte */
        std r0,0(r6)
        addi r6,r6,8
8:
        add r4,r7,r6

.Ldst_aligned:

        cmpdi cr5,r5,128-1

        neg r7,r6
        addi r6,r6,-8        /* prepare for stdu */
        addi r4,r4,-8        /* prepare for ldu */

        clrldi r7,r7,64-7    /* align to cacheline boundary */
        ble+ cr5,.Llessthancacheline

        cmpldi cr6,r7,0
        subf r5,r7,r5
        srdi r7,r7,4         /* divide size by 16 */
        srdi r10,r5,7        /* number of cache lines to copy */

        cmpldi r10,0
        li r11,0             /* number of cache lines to copy with prefetch */
        beq .Lnocacheprefetch

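        /* Split the cache-line count: r11 lines will be copied by the main
           loop with prefetch; the remaining r10 (at most PREFETCH_AHEAD)
           trailing lines are copied later without prefetch.  */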
        cmpldi r10,PREFETCH_AHEAD
        li r12,128+8         /* prefetch distance */
        ble .Llessthanmaxprefetch

        subi r11,r10,PREFETCH_AHEAD
        li r10,PREFETCH_AHEAD

.Llessthanmaxprefetch:
        mtctr r10

.LprefetchSRC:
        dcbt r12,r4
        addi r12,r12,128
        bdnz .LprefetchSRC

.Lnocacheprefetch:
        mtctr r7
        cmpldi cr1,r5,128
        clrldi r5,r5,64-7
        beq cr6,.Lcachelinealigned

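        /* Copy 16 bytes per iteration until DST reaches a 128-byte
           cache-line boundary.  */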
.Laligntocacheline:
        ld r9,0x08(r4)
        ldu r7,0x10(r4)
        std r9,0x08(r6)
        stdu r7,0x10(r6)
        bdnz .Laligntocacheline


.Lcachelinealigned:          /* copy whole cache lines */

        blt- cr1,.Llessthancacheline  /* size < 128 */

.Louterloop:
        cmpdi r11,0
        mtctr r11
        beq- .Lendloop

        li r11,128*ZERO_AHEAD+8      /* DCBZ distance */

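        /* The dcbz below establishes each destination line in the cache
           ZERO_AHEAD lines before it is stored to, so the stores do not have
           to wait for the old destination data to be fetched from memory.
           This is safe only because every such line is completely
           overwritten by the 128-byte copy.  */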
        .align 4
/* Copy whole cachelines, optimized by prefetching SRC cacheline */
.Lloop:                      /* Copy aligned body */
        dcbt r12,r4          /* PREFETCH SOURCE some cache lines ahead */
        ld r9, 0x08(r4)
        dcbz r11,r6
        ld r7, 0x10(r4)      /* 4 register stride copy is optimal */
        ld r8, 0x18(r4)      /* to hide 1st level cache latency.  */
        ld r0, 0x20(r4)
        std r9, 0x08(r6)
        std r7, 0x10(r6)
        std r8, 0x18(r6)
        std r0, 0x20(r6)
        ld r9, 0x28(r4)
        ld r7, 0x30(r4)
        ld r8, 0x38(r4)
        ld r0, 0x40(r4)
        std r9, 0x28(r6)
        std r7, 0x30(r6)
        std r8, 0x38(r6)
        std r0, 0x40(r6)
        ld r9, 0x48(r4)
        ld r7, 0x50(r4)
        ld r8, 0x58(r4)
        ld r0, 0x60(r4)
        std r9, 0x48(r6)
        std r7, 0x50(r6)
        std r8, 0x58(r6)
        std r0, 0x60(r6)
        ld r9, 0x68(r4)
        ld r7, 0x70(r4)
        ld r8, 0x78(r4)
        ldu r0, 0x80(r4)
        std r9, 0x68(r6)
        std r7, 0x70(r6)
        std r8, 0x78(r6)
        stdu r0, 0x80(r6)

        bdnz .Lloop

.Lendloop:
        cmpdi r10,0
        sldi r10,r10,2       /* adjust from 128 to 32 byte stride */
        beq- .Lendloop2
        mtctr r10

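        /* Copy the trailing (at most PREFETCH_AHEAD) cache lines in 32-byte
           steps.  The source lines here should already have been covered by
           the earlier dcbt stream, and dcbz is not used so that no
           destination line beyond the area being overwritten is zeroed.  */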
.Lloop2:                     /* Copy aligned body */
        ld r9, 0x08(r4)
        ld r7, 0x10(r4)
        ld r8, 0x18(r4)
        ldu r0, 0x20(r4)
        std r9, 0x08(r6)
        std r7, 0x10(r6)
        std r8, 0x18(r6)
        stdu r0, 0x20(r6)

        bdnz .Lloop2
.Lendloop2:

.Llessthancacheline:         /* less than a cache line left to do? */
        cmpldi cr0,r5,16
        srdi r7,r5,4         /* divide size by 16 */
        blt- .Ldo_lt16
        mtctr r7

.Lcopy_remaining:
        ld r8,0x08(r4)
        ldu r7,0x10(r4)
        std r8,0x08(r6)
        stdu r7,0x10(r6)
        bdnz .Lcopy_remaining

.Ldo_lt16:                   /* less than 16 bytes? */
        cmpldi cr0,r5,0      /* copy remaining bytes (0-15) */
        beqlr+               /* nothing left to copy */
        addi r4,r4,8
        addi r6,r6,8

.Lshortcopy:                 /* SIMPLE COPY to handle size <= 15 bytes */
        mtcrf 0x01,r5
        sub r7,r4,r6
        bf- cr7*4+0,8f
        ldx r0,r7,r6         /* copy 8 byte */
        std r0,0(r6)
        addi r6,r6,8
8:
        bf cr7*4+1,4f
        lwzx r0,r7,r6        /* copy 4 byte */
        stw r0,0(r6)
        addi r6,r6,4
4:
        bf cr7*4+2,2f
        lhzx r0,r7,r6        /* copy 2 byte */
        sth r0,0(r6)
        addi r6,r6,2
2:
        bf cr7*4+3,1f
        lbzx r0,r7,r6        /* copy 1 byte */
        stb r0,0(r6)
1:      blr

END_GEN_TB (MEMCPY,TB_TOCLESS)
libc_hidden_builtin_def (memcpy)