/* Optimized memcpy implementation for PowerPC64.
   Copyright (C) 2003-2020 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* void * [r3] memcpy (void *dst [r3], void *src [r4], size_t len [r5]);
   Returns 'dst'.

   Memcpy handles short copies (< 32 bytes) using binary (1,2,4,8)
   move blocks (no loops) of lwz/stw.  The tail (remaining 1-3 bytes)
   is handled with the appropriate combination of byte and halfword
   load/stores.  There is minimal effort to optimize the alignment of
   short moves.  The 64-bit implementations of POWER3 and POWER4 do a
   reasonable job of handling unaligned load/stores that do not cross
   32-byte boundaries.

   Longer moves (>= 32 bytes) justify the effort to get at least the
   destination doubleword (8-byte) aligned.  Further optimization is
   possible when both source and destination are doubleword aligned.
   Each case has an optimized unrolled loop.  */
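
/* As a rough illustration only (not part of the original file), the
   dispatch described above corresponds to this C sketch, where
   copy_short and copy_long are hypothetical stand-ins for the .L2
   path and the doubleword paths below:

     void *memcpy_sketch (void *dst, const void *src, size_t len)
     {
       if (len < 32)            // short: branch-tree moves, no loops (.L2)
         return copy_short (dst, src, len);
       // long: move 0-7 bytes so dst is 8-byte aligned, then use the
       // aligned doubleword loop, or the shift-merge loop (.L6) when
       // src and dst do not share the same doubleword alignment.
       return copy_long (dst, src, len);
     }
*/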

#ifndef MEMCPY
# define MEMCPY memcpy
#endif

ENTRY_TOCLESS (MEMCPY, 5)
	CALL_MCOUNT 3

	cmpldi	cr1,5,31
	neg	0,3
	std	3,-16(1)
	std	31,-8(1)
	cfi_offset(31,-8)
	andi.	11,3,7		/* check alignment of dst.  */
	clrldi	0,0,61		/* Number of bytes until the 1st doubleword of dst.  */
	clrldi	10,4,61		/* check alignment of src.  */
	cmpldi	cr6,5,8
	ble-	cr1,.L2		/* If move < 32 bytes use short move code.  */
	cmpld	cr6,10,11
	mr	12,4
	srdi	9,5,3		/* Number of full double words remaining.  */
	mtcrf	0x01,0
	mr	31,5
	beq	.L0

	subf	31,0,5
  /* Move 0-7 bytes as needed to get the destination doubleword aligned.  */
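  /* Illustrative C for this prologue (not part of the original file;
     d, s, and len stand for the dst (r3), src (r12), and length
     registers).  The mtcrf 0x01,0 above moved the low bits of r0 into
     cr7, so the bf 31/30/29 tests below check the 1-, 2-, and 4-bits
     of the fix-up count without extra compare instructions:

       size_t fix = (-(uintptr_t) d) & 7; // bytes until d is 8-byte aligned
       if (fix & 1)                       // bf 31: one byte (lbz/stb)
         *d++ = *s++;
       if (fix & 2)                       // bf 30: halfword (lhz/sth)
         { d[0] = s[0]; d[1] = s[1]; d += 2; s += 2; }
       if (fix & 4)                       // bf 29: word (lwz/stw)
         { d[0] = s[0]; d[1] = s[1]; d[2] = s[2]; d[3] = s[3]; d += 4; s += 4; }
       len -= fix;
  */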
1:	bf	31,2f
	lbz	6,0(12)
	addi	12,12,1
	stb	6,0(3)
	addi	3,3,1
2:	bf	30,4f
	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
4:	bf	29,0f
	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
0:
	clrldi	10,12,61	/* check alignment of src again.  */
	srdi	9,31,3		/* Number of full double words remaining.  */

  /* Copy doublewords from source to destination, assuming the
     destination is aligned on a doubleword boundary.

     At this point we know there are at least 25 bytes left (32-7) to copy.
     The next step is to determine if the source is also doubleword aligned.
     If not, branch to the unaligned move code at .L6, which uses
     a load, shift, store strategy.

     Otherwise source and destination are doubleword aligned, and we can
     use the optimized doubleword copy loop.  */
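
  /* Illustrative C for this check (not part of the original file).
     cr6 was set by the earlier cmpld of (src & 7) with (dst & 7);
     after the prologue dst is doubleword aligned, so equal residues
     mean src is now doubleword aligned as well:

       if (((uintptr_t) src & 7) != ((uintptr_t) dst & 7))
         goto unaligned;                // .L6: load, shift, store
  */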
.L0:
	clrldi	11,31,61
	mtcrf	0x01,9
	bne-	cr6,.L6		/* If source is not DW aligned.  */

  /* Move doublewords where destination and source are DW aligned.
     Use an unrolled loop to copy 4 doublewords (32 bytes) per iteration.
     If the copy is not an exact multiple of 32 bytes, 1-3
     doublewords are copied as needed to set up the main loop.  After
     the main loop exits there may be a tail of 1-7 bytes.  These bytes are
     copied a word/halfword/byte at a time as needed to preserve alignment.  */
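
  /* A minimal C sketch of this setup and loop (illustrative only;
     d8 and s8 are hypothetical doubleword pointers and n = len / 8
     is the doubleword count in r9):

       if (n & 2)                       // bf 30: peel 2 doublewords
         { d8[0] = s8[0]; d8[1] = s8[1]; d8 += 2; s8 += 2; }
       if (n & 1)                       // bf 31: peel 1 doubleword
         *d8++ = *s8++;
       for (size_t i = len >> 5; i != 0; --i)  // 4 doublewords per pass
         {
           d8[0] = s8[0]; d8[1] = s8[1]; d8[2] = s8[2]; d8[3] = s8[3];
           d8 += 4; s8 += 4;
         }
  */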

	srdi	8,31,5
	cmpldi	cr1,9,4
	cmpldi	cr6,11,0
	mr	11,12

	bf	30,1f
	ld	6,0(12)
	ld	7,8(12)
	addi	11,12,16
	mtctr	8
	std	6,0(3)
	std	7,8(3)
	addi	10,3,16
	bf	31,4f
	ld	0,16(12)
	std	0,16(3)
	blt	cr1,3f
	addi	11,12,24
	addi	10,3,24
	b	4f
	.align	4
1:
	mr	10,3
	mtctr	8
	bf	31,4f
	ld	6,0(12)
	addi	11,12,8
	std	6,0(3)
	addi	10,3,8

	.align	4
4:
	ld	6,0(11)
	ld	7,8(11)
	ld	8,16(11)
	ld	0,24(11)
	addi	11,11,32
2:
	std	6,0(10)
	std	7,8(10)
	std	8,16(10)
	std	0,24(10)
	addi	10,10,32
	bdnz	4b
3:

	rldicr	0,31,0,60
	mtcrf	0x01,31
	beq	cr6,0f
.L9:
	add	3,3,0
	add	12,12,0

/* At this point we have a tail of 0-7 bytes and we know that the
   destination is doubleword aligned.  */
4:	bf	29,2f
	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
2:	bf	30,1f
	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
1:	bf	31,0f
	lbz	6,0(12)
	stb	6,0(3)
0:
	/* Return original dst pointer.  */
	ld	31,-8(1)
	ld	3,-16(1)
	blr

/* Copy up to 31 bytes.  This is divided into two cases: 0-8 bytes and
   9-31 bytes.  Each case is handled without loops, using binary
   (1,2,4,8) tests.

   In the short (0-8 byte) case no attempt is made to force alignment
   of either source or destination.  The hardware will handle the
   unaligned load/stores with small delays for crossing 32-, 64-, and
   4096-byte boundaries.  Since these short moves are unlikely to be
   unaligned or to cross these boundaries, the overhead to force
   alignment is not justified.

   The longer (9-31 byte) move is more likely to cross 32- or 64-byte
   boundaries.  Since only loads are sensitive to the 32-/64-byte
   boundaries, it is more important to align the source than the
   destination.  If the source is not already word aligned, we first
   move 1-3 bytes as needed.  Since we are only word aligned we don't
   use doubleword load/stores to ensure that all loads are aligned.
   While the destination and stores may still be unaligned, this
   is only an issue for page (4096-byte boundary) crossing, which
   should be rare for these short moves.  The hardware handles this
   case automatically with a small delay.  */
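
/* An illustrative C outline of the 9-31 byte path (not part of the
   original file; d and s are byte pointers standing for dst and src):

     size_t fix = (-(uintptr_t) s) & 3; // 1-3 bytes until s is word aligned
     // The code below does this fix-up from a single aligned lwz at
     // (s & ~3); plain byte copies are the portable C equivalent.
     for (size_t i = 0; i < fix; i++)
       d[i] = s[i];
     d += fix; s += fix; len -= fix;
     // The remaining bytes move as 16/8/4/2/1-byte blocks selected by
     // the bits of len, so every load is word aligned or smaller.
*/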

	.align	4
.L2:
	mtcrf	0x01,5
	neg	8,4
	clrrdi	11,4,2
	andi.	0,8,3
	ble	cr6,.LE8	/* Handle moves of 0-8 bytes.  */
/* At least 9 bytes left.  Get the source word aligned.  */
	cmpldi	cr1,5,16
	mr	10,5
	mr	12,4
	cmpldi	cr6,0,2
	beq	.L3	/* If the source is already word aligned skip this.  */
/* Copy 1-3 bytes to get source address word aligned.  */
	lwz	6,0(11)
	subf	10,0,5
	add	12,4,0
	blt	cr6,5f
	srdi	7,6,16
	bgt	cr6,3f
#ifdef __LITTLE_ENDIAN__
	sth	7,0(3)
#else
	sth	6,0(3)
#endif
	b	7f
	.align	4
3:
#ifdef __LITTLE_ENDIAN__
	rotlwi	6,6,24
	stb	6,0(3)
	sth	7,1(3)
#else
	stb	7,0(3)
	sth	6,1(3)
#endif
	b	7f
	.align	4
5:
#ifdef __LITTLE_ENDIAN__
	rotlwi	6,6,8
#endif
	stb	6,0(3)
7:
	cmpldi	cr1,10,16
	add	3,3,0
	mtcrf	0x01,10
	.align	4
.L3:
/* At least 6 bytes left and the source is word aligned.  */
	blt	cr1,8f
16:	/* Move 16 bytes.  */
	lwz	6,0(12)
	lwz	7,4(12)
	stw	6,0(3)
	lwz	6,8(12)
	stw	7,4(3)
	lwz	7,12(12)
	addi	12,12,16
	stw	6,8(3)
	stw	7,12(3)
	addi	3,3,16
8:	/* Move 8 bytes.  */
	bf	28,4f
	lwz	6,0(12)
	lwz	7,4(12)
	addi	12,12,8
	stw	6,0(3)
	stw	7,4(3)
	addi	3,3,8
4:	/* Move 4 bytes.  */
	bf	29,2f
	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
2:	/* Move 2-3 bytes.  */
	bf	30,1f
	lhz	6,0(12)
	sth	6,0(3)
	bf	31,0f
	lbz	7,2(12)
	stb	7,2(3)
	ld	3,-16(1)
	blr
1:	/* Move 1 byte.  */
	bf	31,0f
	lbz	6,0(12)
	stb	6,0(3)
0:
	/* Return original dst pointer.  */
	ld	3,-16(1)
	blr

/* Special case to copy 0-8 bytes.  */
	.align	4
.LE8:
	mr	12,4
	bne	cr6,4f
/* Would have liked to use ld/std here but the 630 processors are
   slow for load/store doubles that are not at least word aligned.
   Unaligned load/store words execute with only a 1 cycle penalty.  */
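
/* Illustratively (not part of the original file), this exact-8-byte
   case in C, as two word copies instead of one doubleword ld/std
   (d and s are byte pointers for dst and src):

     uint32_t w0, w1;
     memcpy (&w0, s, 4);        // lwz 6,0(4)
     memcpy (&w1, s + 4, 4);    // lwz 7,4(4)
     memcpy (d, &w0, 4);        // stw 6,0(3)
     memcpy (d + 4, &w1, 4);    // stw 7,4(3)
*/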
	lwz	6,0(4)
	lwz	7,4(4)
	stw	6,0(3)
	stw	7,4(3)
	/* Return original dst pointer.  */
	ld	3,-16(1)
	blr
	.align	4
4:	bf	29,2b
	lwz	6,0(4)
	stw	6,0(3)
6:
	bf	30,5f
	lhz	7,4(4)
	sth	7,4(3)
	bf	31,0f
	lbz	8,6(4)
	stb	8,6(3)
	ld	3,-16(1)
	blr
	.align	4
5:
	bf	31,0f
	lbz	6,4(4)
	stb	6,4(3)
	.align	4
0:
	/* Return original dst pointer.  */
	ld	3,-16(1)
	blr

	.align	4
.L6:

  /* Copy doublewords where the destination is aligned but the source is
     not.  Use aligned doubleword loads from the source, shifted to realign
     the data, to allow aligned destination stores.  */
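
  /* The shift-merge technique in rough C (illustrative only; this
     shows the big-endian arms below, and little-endian swaps the
     shift directions).  The misalignment is 1-7 bytes, so sh is 8-56
     and neither shift below is ever by 0 or 64.  ndw is a
     hypothetical name for the number of doublewords to store:

       const uint64_t *s8 = (const uint64_t *) ((uintptr_t) src & ~(uintptr_t) 7);
       uint64_t *d8 = (uint64_t *) dst;        // dst is already aligned
       unsigned int sh = ((uintptr_t) src & 7) * 8;
       uint64_t a = s8[0], b = s8[1];          // two aligned loads ahead
       s8 += 2;
       for (size_t i = 0; i < ndw; i++)
         {
           *d8++ = (a << sh) | (b >> (64 - sh)); // sld/srd + or, then std
           a = b;
           if (i + 1 < ndw)                    // like the code below, stop
             b = *s8++;                        // loading before the end
         }
  */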
	subf	5,10,12
	andi.	0,9,1
	cmpldi	cr6,11,0
	sldi	10,10,3
	mr	11,9
	mr	4,3
	ld	6,0(5)
	ld	7,8(5)
	subfic	9,10,64
	beq	2f
#ifdef __LITTLE_ENDIAN__
	srd	0,6,10
#else
	sld	0,6,10
#endif
	cmpldi	11,1
	mr	6,7
	addi	4,4,-8
	addi	11,11,-1
	b	1f
2:	addi	5,5,8
	.align	4
#ifdef __LITTLE_ENDIAN__
0:	srd	0,6,10
	sld	8,7,9
#else
0:	sld	0,6,10
	srd	8,7,9
#endif
	cmpldi	11,2
	ld	6,8(5)
	or	0,0,8
	addi	11,11,-2
	std	0,0(4)
#ifdef __LITTLE_ENDIAN__
	srd	0,7,10
1:	sld	8,6,9
#else
	sld	0,7,10
1:	srd	8,6,9
#endif
	or	0,0,8
	beq	8f
	ld	7,16(5)
	std	0,8(4)
	addi	5,5,16
	addi	4,4,16
	b	0b
	.align	4
8:
	std	0,8(4)
	rldicr	0,31,0,60
	mtcrf	0x01,31
	bne	cr6,.L9		/* If the tail is nonzero, copy it at .L9.  */
	/* Return original dst pointer.  */
	ld	31,-8(1)
	ld	3,-16(1)
	blr
END_GEN_TB (MEMCPY,TB_TOCLESS)
libc_hidden_builtin_def (memcpy)