]> git.ipfire.org Git - thirdparty/glibc.git/blame - sysdeps/powerpc/powerpc64/cell/memcpy.S
Clean up __MALLOC_* macros.
[thirdparty/glibc.git] / sysdeps / powerpc / powerpc64 / cell / memcpy.S
CommitLineData
/* Optimized memcpy implementation for CELL BE PowerPC.
   Copyright (C) 2010-2013 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */
#include <sysdep.h>
#include <bp-sym.h>
#include <bp-asm.h>

#define PREFETCH_AHEAD 6	/* number of cache lines of SRC to prefetch ahead  */
#define ZERO_AHEAD 4		/* number of cache lines of DST to dcbz ahead  */

/* memcpy routine optimized for CELL-BE-PPC	v2.0
 *
 * The CELL PPC core has 1 integer unit and 1 load/store unit
 * CELL:
 * 1st level data cache = 32K
 * 2nd level data cache = 512K
 * 3rd level data cache = 0K
 * With 3.2 GHz clockrate the latency to 2nd level cache is >36 clocks,
 * latency to memory is >400 clocks
 * To improve copy performance we need to prefetch source data
 * far ahead to hide this latency
 * For best performance instruction forms ending in "." like "andi."
 * should be avoided as they are implemented in microcode on CELL.
 * The below code is loop unrolled for the CELL cache line of 128 bytes
 */

.align  7

/* void *memcpy (void *dst, const void *src, size_t n)
   In:   r3 = dst, r4 = src, r5 = n (bytes); returns r3 = dst (never modified).
   Register roles below:
     r6  = destination cursor (r3 is preserved for the return value)
     r4  = source cursor, r5 = bytes remaining
     r7  = src-dst delta during head/tail copies; 16-byte chunk count otherwise
     r10 = cache lines to copy, r11 = lines to copy with SRC prefetch
           (later reused as the dcbz distance), r12 = dcbt prefetch distance.  */
EALIGN (BP_SYM (memcpy), 5, 0)
	CALL_MCOUNT 3

	dcbt	0,r4		/* Prefetch ONE SRC cacheline  */
	cmpldi	cr1,r5,16	/* is size < 16 ?  */
	mr	r6,r3		/* copy dst; r3 must survive as return value  */
	blt+	cr1,.Lshortcopy

.Lbigcopy:
	neg	r8,r3		/* LS 4 bits = # bytes to 16-byte dest bdry  */
	clrldi	r8,r8,64-4	/* align to 16-byte boundary  */
	sub	r7,r4,r3	/* r7 = src - dst; src byte is at r7(dst)  */
	cmpldi	cr0,r8,0
	beq+	.Ldst_aligned

.Ldst_unaligned:		/* copy 0..15 head bytes, sizes 1/2/4/8 via cr7  */
	mtcrf	0x01,r8		/* put #bytes to boundary into cr7  */
	subf	r5,r8,r5

	bf	cr7*4+3,1f
	lbzx	r0,r7,r6	/* copy 1 byte  */
	stb	r0,0(r6)
	addi	r6,r6,1
1:	bf	cr7*4+2,2f
	lhzx	r0,r7,r6	/* copy 2 byte  */
	sth	r0,0(r6)
	addi	r6,r6,2
2:	bf	cr7*4+1,4f
	lwzx	r0,r7,r6	/* copy 4 byte  */
	stw	r0,0(r6)
	addi	r6,r6,4
4:	bf	cr7*4+0,8f
	ldx	r0,r7,r6	/* copy 8 byte  */
	std	r0,0(r6)
	addi	r6,r6,8
8:
	add	r4,r7,r6	/* re-derive src cursor from delta + new dst  */

.Ldst_aligned:

	cmpdi	cr5,r5,128-1

	neg	r7,r6
	addi	r6,r6,-8	/* prepare for stdu  */
	addi	r4,r4,-8	/* prepare for ldu  */

	clrldi	r7,r7,64-7	/* r7 = bytes to 128-byte cacheline boundary  */
	ble+	cr5,.Llessthancacheline

	cmpldi	cr6,r7,0
	subf	r5,r7,r5
	srdi	r7,r7,4		/* divide size by 16  */
	srdi	r10,r5,7	/* number of cache lines to copy  */

	cmpldi	r10,0
	li	r11,0		/* number cachelines to copy with prefetch  */
	beq	.Lnocacheprefetch

	cmpldi	r10,PREFETCH_AHEAD
	li	r12,128+8	/* prefetch distance: one line ahead, +8 to
				   offset the ldu pre-decrement of r4  */
	ble	.Llessthanmaxprefetch

	/* More lines than PREFETCH_AHEAD: r11 lines keep prefetching in
	   .Lloop; the trailing r10 = PREFETCH_AHEAD lines run without.  */
	subi	r11,r10,PREFETCH_AHEAD
	li	r10,PREFETCH_AHEAD

.Llessthanmaxprefetch:
	mtctr	r10

.LprefetchSRC:			/* warm up: touch the first r10 SRC lines  */
	dcbt	r12,r4
	addi	r12,r12,128
	bdnz	.LprefetchSRC

.Lnocacheprefetch:
	mtctr	r7		/* 16-byte chunks up to the dst line boundary  */
	cmpldi	cr1,r5,128
	clrldi	r5,r5,64-7	/* r5 = leftover bytes (< 128) for the tail  */
	beq	cr6,.Lcachelinealigned

.Laligntocacheline:		/* 16 bytes/iter until dst is line-aligned  */
	ld	r9,0x08(r4)
	ldu	r7,0x10(r4)
	std	r9,0x08(r6)
	stdu	r7,0x10(r6)
	bdnz	.Laligntocacheline


.Lcachelinealigned:		/* copy whole cache lines  */

	blt-	cr1,.Llessthancacheline	/* size < 128  */

.Louterloop:
	cmpdi	r11,0		/* any lines to copy with prefetch?  */
	mtctr	r11
	beq-	.Lendloop

	li	r11,128*ZERO_AHEAD +8	/* DCBZ dist: zero DST lines ahead so
					   their stores need no memory read  */

.align	4
	/* Copy whole cachelines, optimized by prefetching SRC cacheline  */
.Lloop:				/* Copy aligned body  */
	dcbt	r12,r4		/* PREFETCH SOURCE some cache lines ahead  */
	ld	r9, 0x08(r4)
	dcbz	r11,r6		/* pre-zero the DST line ZERO_AHEAD ahead  */
	ld	r7, 0x10(r4)	/* 4 register stride copy is optimal  */
	ld	r8, 0x18(r4)	/* to hide 1st level cache latency.  */
	ld	r0, 0x20(r4)
	std	r9, 0x08(r6)
	std	r7, 0x10(r6)
	std	r8, 0x18(r6)
	std	r0, 0x20(r6)
	ld	r9, 0x28(r4)
	ld	r7, 0x30(r4)
	ld	r8, 0x38(r4)
	ld	r0, 0x40(r4)
	std	r9, 0x28(r6)
	std	r7, 0x30(r6)
	std	r8, 0x38(r6)
	std	r0, 0x40(r6)
	ld	r9, 0x48(r4)
	ld	r7, 0x50(r4)
	ld	r8, 0x58(r4)
	ld	r0, 0x60(r4)
	std	r9, 0x48(r6)
	std	r7, 0x50(r6)
	std	r8, 0x58(r6)
	std	r0, 0x60(r6)
	ld	r9, 0x68(r4)
	ld	r7, 0x70(r4)
	ld	r8, 0x78(r4)
	ldu	r0, 0x80(r4)	/* advance src cursor one full line  */
	std	r9, 0x68(r6)
	std	r7, 0x70(r6)
	std	r8, 0x78(r6)
	stdu	r0, 0x80(r6)	/* advance dst cursor one full line  */

	bdnz	.Lloop

.Lendloop:			/* last r10 lines: no prefetch/dcbz ahead  */
	cmpdi	r10,0
	sldi	r10,r10,2	/* adjust from 128 to 32 byte stride  */
	beq-	.Lendloop2
	mtctr	r10

.Lloop2:			/* Copy aligned body  */
	ld	r9, 0x08(r4)
	ld	r7, 0x10(r4)
	ld	r8, 0x18(r4)
	ldu	r0, 0x20(r4)
	std	r9, 0x08(r6)
	std	r7, 0x10(r6)
	std	r8, 0x18(r6)
	stdu	r0, 0x20(r6)

	bdnz	.Lloop2
.Lendloop2:

.Llessthancacheline:		/* less than cache line (127 bytes) to do ?  */
	cmpldi	cr0,r5,16
	srdi	r7,r5,4		/* divide size by 16  */
	blt-	.Ldo_lt16
	mtctr	r7

.Lcopy_remaining:		/* 16 bytes/iter on the tail  */
	ld	r8,0x08(r4)
	ldu	r7,0x10(r4)
	std	r8,0x08(r6)
	stdu	r7,0x10(r6)
	bdnz	.Lcopy_remaining

.Ldo_lt16:			/* less than 16 ?  */
	cmpldi	cr0,r5,0	/* copy remaining bytes (0-15)  */
	beqlr+			/* no rest to copy  */
	addi	r4,r4,8		/* undo the ldu/stdu pre-decrement bias  */
	addi	r6,r6,8

.Lshortcopy:			/* SIMPLE COPY to handle size =< 15 bytes  */
	mtcrf	0x01,r5		/* low 4 bits of size select 8/4/2/1 copies  */
	sub	r7,r4,r6	/* r7 = src - dst; src byte is at r7(dst)  */
	bf-	cr7*4+0,8f
	ldx	r0,r7,r6	/* copy 8 byte  */
	std	r0,0(r6)
	addi	r6,r6,8
8:
	bf	cr7*4+1,4f
	lwzx	r0,r7,r6	/* copy 4 byte  */
	stw	r0,0(r6)
	addi	r6,r6,4
4:
	bf	cr7*4+2,2f
	lhzx	r0,r7,r6	/* copy 2 byte  */
	sth	r0,0(r6)
	addi	r6,r6,2
2:
	bf	cr7*4+3,1f
	lbzx	r0,r7,r6	/* copy 1 byte  */
	stb	r0,0(r6)
1:	blr

END_GEN_TB (BP_SYM (memcpy),TB_TOCLESS)
libc_hidden_builtin_def (memcpy)