/* Optimized memcpy implementation for PowerPC64.
   Copyright (C) 2003, 2006, 2011 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, write to the Free
   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307 USA.  */

#include <sysdep.h>
#include <bp-sym.h>
#include <bp-asm.h>

/* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
   Returns 'dst'.

   Memcpy handles short copies (< 32 bytes) using binary move blocks
   (no loops) of lwz/stw.  The tail (remaining 1-3 bytes) is handled
   with the appropriate combination of byte and halfword load/stores.
   There is minimal effort to optimize the alignment of short moves.
   The 64-bit implementations of POWER3 and POWER4 do a reasonable job
   of handling unaligned load/stores that do not cross 32-byte boundaries.

   Longer moves (>= 32 bytes) justify the effort to get at least the
   destination doubleword (8-byte) aligned.  Further optimization is
   possible when both source and destination are doubleword aligned.
   Each case has an optimized unrolled loop.  */
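
/* For orientation, a minimal C sketch of the strategy just described
   (illustrative only; the byte loops stand in for the unrolled
   assembly paths below):

     #include <stddef.h>
     #include <stdint.h>

     static void *memcpy_sketch (void *dst, const void *src, size_t len)
     {
       unsigned char *d = dst;
       const unsigned char *s = src;
       if (len >= 32)
         {
           while ((uintptr_t) d & 7)        // move 0-7 bytes: align dst
             { *d++ = *s++; len--; }
           if (((uintptr_t) s & 7) == 0)    // src now aligned too: DW loop
             for (; len >= 8; len -= 8, d += 8, s += 8)
               *(uint64_t *) d = *(const uint64_t *) s;
           // otherwise .L6 below uses aligned loads plus shift/merge;
           // a byte loop is the straightforward C equivalent
         }
       while (len--)                        // short copy or 0-7 byte tail
         *d++ = *s++;
       return dst;
     }  */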

EALIGN (BP_SYM (memcpy), 5, 0)
        CALL_MCOUNT 3

        cmpldi  cr1,5,31
        neg     0,3
        std     3,-16(1)
        std     31,-8(1)
        cfi_offset(31,-8)
        andi.   11,3,7          /* check alignment of dst.  */
        clrldi  0,0,61          /* Number of bytes until the 1st doubleword of dst.  */
        clrldi  10,4,61         /* check alignment of src.  */
        cmpldi  cr6,5,8
        ble-    cr1,.L2         /* If move < 32 bytes use short move code.  */
        cmpld   cr6,10,11
        mr      12,4
        srdi    9,5,3           /* Number of full doublewords remaining.  */
        mtcrf   0x01,0
        mr      31,5
        beq     .L0

        subf    31,0,5
        /* Move 0-7 bytes as needed to get the destination doubleword aligned.  */
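        /* Note: mtcrf 0x01,0 above copied the low four bits of the
           alignment count (r0) into cr7, so CR bit 31 is the 1-byte bit,
           bit 30 the 2-byte bit, and bit 29 the 4-byte bit; each "bf"
           below skips the corresponding move when that bit is clear.  */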
1:      bf      31,2f
        lbz     6,0(12)
        addi    12,12,1
        stb     6,0(3)
        addi    3,3,1
2:      bf      30,4f
        lhz     6,0(12)
        addi    12,12,2
        sth     6,0(3)
        addi    3,3,2
4:      bf      29,0f
        lwz     6,0(12)
        addi    12,12,4
        stw     6,0(3)
        addi    3,3,4
0:
        clrldi  10,12,61        /* check alignment of src again.  */
        srdi    9,31,3          /* Number of full doublewords remaining.  */

/* Copy doublewords from source to destination, assuming the
   destination is aligned on a doubleword boundary.

   At this point we know there are at least 25 bytes left (32-7) to copy.
   The next step is to determine if the source is also doubleword aligned.
   If not, branch to the unaligned move code at .L6, which uses
   a load, shift, store strategy.

   Otherwise source and destination are doubleword aligned, and we can
   use the optimized doubleword copy loop.  */
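/* Note: cr6 still holds the comparison of the source and destination
   low-order three bits (cmpld cr6,10,11 in the prologue); equality
   means that aligning the destination has also aligned the source.  */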
.L0:
        clrldi  11,31,61
        mtcrf   0x01,9
        bne-    cr6,.L6         /* If source is not DW aligned.  */

/* Move doublewords where destination and source are DW aligned.
   Use an unrolled loop to copy 4 doublewords (32 bytes) per iteration.
   If the copy is not an exact multiple of 32 bytes, 1-3
   doublewords are copied as needed to set up the main loop.  After
   the main loop exits there may be a tail of 1-7 bytes.  These bytes are
   copied a word/halfword/byte at a time as needed to preserve alignment.  */
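/* Note: the srdi below computes r8 = len/32, which is loaded into the
   CTR as the main-loop iteration count; cr7 (set from the doubleword
   count r9 by mtcrf 0x01,9 above) holds the low bits of that count, and
   the "bf 30"/"bf 31" tests copy the odd doublewords before the loop.  */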

        srdi    8,31,5
        cmpldi  cr1,9,4
        cmpldi  cr6,11,0
        mr      11,12

        bf      30,1f
        ld      6,0(12)
        ld      7,8(12)
        addi    11,12,16
        mtctr   8
        std     6,0(3)
        std     7,8(3)
        addi    10,3,16
        bf      31,4f
        ld      0,16(12)
        std     0,16(3)
        blt     cr1,3f
        addi    11,12,24
        addi    10,3,24
        b       4f
        .align  4
1:
        mr      10,3
        mtctr   8
        bf      31,4f
        ld      6,0(12)
        addi    11,12,8
        std     6,0(3)
        addi    10,3,8

        .align  4
4:
        ld      6,0(11)
        ld      7,8(11)
        ld      8,16(11)
        ld      0,24(11)
        addi    11,11,32
2:
        std     6,0(10)
        std     7,8(10)
        std     8,16(10)
        std     0,24(10)
        addi    10,10,32
        bdnz    4b
3:

        rldicr  0,31,0,60
        mtcrf   0x01,31
        beq     cr6,0f
.L9:
        add     3,3,0
        add     12,12,0

/* At this point we have a tail of 0-7 bytes and we know that the
   destination is doubleword aligned.  */
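/* Note: cr7 was set from the byte count (mtcrf 0x01,31 above), so
   bits 29/30/31 select the word/halfword/byte tail moves below.  */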
4:      bf      29,2f
        lwz     6,0(12)
        addi    12,12,4
        stw     6,0(3)
        addi    3,3,4
2:      bf      30,1f
        lhz     6,0(12)
        addi    12,12,2
        sth     6,0(3)
        addi    3,3,2
1:      bf      31,0f
        lbz     6,0(12)
        stb     6,0(3)
0:
        /* Return original dst pointer.  */
        ld      31,-8(1)
        ld      3,-16(1)
        blr

/* Copy up to 31 bytes.  This is divided into two cases, 0-8 bytes and
   9-31 bytes.  Each case is handled without loops, using binary
   (1,2,4,8) tests.

   In the short (0-8 byte) case no attempt is made to force alignment
   of either source or destination.  The hardware will handle the
   unaligned load/stores with small delays for crossing 32-, 64-, and
   4096-byte boundaries.  Since these short moves are unlikely to be
   unaligned or cross these boundaries, the overhead to force
   alignment is not justified.

   The longer (9-31 byte) move is more likely to cross 32- or 64-byte
   boundaries.  Since only loads are sensitive to the 32-/64-byte
   boundaries, it is more important to align the source than the
   destination.  If the source is not already word aligned, we first
   move 1-3 bytes as needed.  Since we are only word aligned, we don't
   use doubleword load/stores, to ensure that all loads are aligned.
   While the destination and stores may still be unaligned, this
   is only an issue for page (4096-byte boundary) crossing, which
   should be rare for these short moves.  The hardware handles this
   case automatically with a small delay.  */
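
/* For reference, a minimal C sketch of the binary length decomposition
   used below (illustrative only; "short_copy" is a hypothetical name,
   and the fixed-size memcpy calls stand in for the lwz/stw pairs --
   the assembly instead tests CR bits set by mtcrf 0x01,5):

     #include <stddef.h>
     #include <string.h>

     static void short_copy (unsigned char *d, const unsigned char *s,
                             size_t n)      // caller guarantees n < 32
     {
       if (n & 16) { memcpy (d, s, 16); d += 16; s += 16; }
       if (n & 8)  { memcpy (d, s, 8);  d += 8;  s += 8;  }
       if (n & 4)  { memcpy (d, s, 4);  d += 4;  s += 4;  }
       if (n & 2)  { memcpy (d, s, 2);  d += 2;  s += 2;  }
       if (n & 1)  *d = *s;
     }  */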

        .align  4
.L2:
        mtcrf   0x01,5
        neg     8,4
        clrrdi  11,4,2
        andi.   0,8,3
        ble     cr6,.LE8        /* Handle moves of 0-8 bytes.  */
/* At least 9 bytes left.  Get the source word aligned.  */
        cmpldi  cr1,5,16
        mr      10,5
        mr      12,4
        cmpldi  cr6,0,2
        beq     .L3     /* If the source is already word aligned, skip this.  */
/* Copy 1-3 bytes to get source address word aligned.  */
        lwz     6,0(11)
        subf    10,0,5
        add     12,4,0
        blt     cr6,5f
        srdi    7,6,16
        bgt     cr6,3f
        sth     6,0(3)
        b       7f
        .align  4
3:
        stb     7,0(3)
        sth     6,1(3)
        b       7f
        .align  4
5:
        stb     6,0(3)
7:
        cmpldi  cr1,10,16
        add     3,3,0
        mtcrf   0x01,10
        .align  4
.L3:
/* At least 6 bytes left and the source is word aligned.  */
        blt     cr1,8f
16:     /* Move 16 bytes.  */
        lwz     6,0(12)
        lwz     7,4(12)
        stw     6,0(3)
        lwz     6,8(12)
        stw     7,4(3)
        lwz     7,12(12)
        addi    12,12,16
        stw     6,8(3)
        stw     7,12(3)
        addi    3,3,16
8:      /* Move 8 bytes.  */
        bf      28,4f
        lwz     6,0(12)
        lwz     7,4(12)
        addi    12,12,8
        stw     6,0(3)
        stw     7,4(3)
        addi    3,3,8
4:      /* Move 4 bytes.  */
        bf      29,2f
        lwz     6,0(12)
        addi    12,12,4
        stw     6,0(3)
        addi    3,3,4
2:      /* Move 2-3 bytes.  */
        bf      30,1f
        lhz     6,0(12)
        sth     6,0(3)
        bf      31,0f
        lbz     7,2(12)
        stb     7,2(3)
        ld      3,-16(1)
        blr
1:      /* Move 1 byte.  */
        bf      31,0f
        lbz     6,0(12)
        stb     6,0(3)
0:
        /* Return original dst pointer.  */
        ld      3,-16(1)
        blr

/* Special case to copy 0-8 bytes.  */
        .align  4
.LE8:
        mr      12,4
        bne     cr6,4f
/* We would have liked to use ld/std here, but the 630 processors are
   slow for load/store doubles that are not at least word aligned.
   Unaligned load/store word executes with only a 1 cycle penalty.  */
        lwz     6,0(4)
        lwz     7,4(4)
        stw     6,0(3)
        stw     7,4(3)
/* Return original dst pointer.  */
        ld      3,-16(1)
        blr
        .align  4
4:      bf      29,2b
        lwz     6,0(4)
        stw     6,0(3)
6:
        bf      30,5f
        lhz     7,4(4)
        sth     7,4(3)
        bf      31,0f
        lbz     8,6(4)
        stb     8,6(3)
        ld      3,-16(1)
        blr
        .align  4
5:
        bf      31,0f
        lbz     6,4(4)
        stb     6,4(3)
        .align  4
0:
        /* Return original dst pointer.  */
        ld      3,-16(1)
        blr

        .align  4
.L6:

/* Copy doublewords where the destination is aligned but the source is
   not.  Use aligned doubleword loads from the source, shifted to realign
   the data, to allow aligned destination stores.  */
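/* Worked example (big-endian, illustrative): if the source starts 2
   bytes past a doubleword boundary, r10 is set to 16 (bits already
   consumed) and r9 to 48, and each stored doubleword is assembled as

       (prev << 16) | (next >> 48)

   by the sld/srd/or sequence below.  The left shift discards the
   source bytes already consumed; the right shift brings in the leading
   bytes of the next aligned doubleword.  In C, with hypothetical names
   (sh = the bit shift in r10, src_aligned = the rounded-down source):

     uint64_t prev = src_aligned[0];
     for (size_t i = 0; i < n_dw; i++)
       {
         uint64_t next = src_aligned[i + 1];
         dst_dw[i] = (prev << sh) | (next >> (64 - sh));
         prev = next;
       }  */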
        subf    5,10,12
        andi.   0,9,1
        cmpldi  cr6,11,0
        sldi    10,10,3
        mr      11,9
        mr      4,3
        ld      6,0(5)
        ld      7,8(5)
        subfic  9,10,64
        beq     2f
        sld     0,6,10
        cmpldi  11,1
        mr      6,7
        addi    4,4,-8
        addi    11,11,-1
        b       1f
2:      addi    5,5,8
        .align  4
0:      sld     0,6,10
        srd     8,7,9
        cmpldi  11,2
        ld      6,8(5)
        or      0,0,8
        addi    11,11,-2
        std     0,0(4)
        sld     0,7,10
1:      srd     8,6,9
        or      0,0,8
        beq     8f
        ld      7,16(5)
        std     0,8(4)
        addi    5,5,16
        addi    4,4,16
        b       0b
        .align  4
8:
        std     0,8(4)
        rldicr  0,31,0,60
        mtcrf   0x01,31
        bne     cr6,.L9         /* If there is a 1-7 byte tail, go copy it.  */
/* Return original dst pointer.  */
        ld      31,-8(1)
        ld      3,-16(1)
        blr
END_GEN_TB (BP_SYM (memcpy),TB_TOCLESS)
libc_hidden_builtin_def (memcpy)