/* sysdeps/powerpc/powerpc64/memcpy.S (glibc)  [BZ #274]  */
/* Optimized memcpy implementation for PowerPC64.
   Copyright (C) 2003 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, write to the Free
   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307 USA.  */

#include <sysdep.h>
#include <bp-sym.h>
#include <bp-asm.h>

/* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
   Returns 'dst'.

   Memcpy handles short copies (< 32-bytes) using an unaligned
   word lwz/stw loop.  The tail (remaining 1-3) bytes is handled with the
   appropriate combination of byte and halfword load/stores.  There is no
   attempt to optimize the alignment of short moves.  The 64-bit
   implementations of POWER3 and POWER4 do a reasonable job of handling
   unaligned load/stores that do not cross 32-byte boundaries.

   Longer moves (>= 32-bytes) justify the effort to get at least the
   destination doubleword (8-byte) aligned.  Further optimization is
   possible when both source and destination are doubleword aligned.
   Each case has an optimized unrolled loop.

   Register usage throughout:
     r3  = current dst pointer (also the return value via r30)
     r12 = current src pointer (copy of incoming r4)
     r31 = remaining length (callee-saved, spilled to -8(r1))
     r30 = original dst, restored into r3 on exit (spilled to -16(r1))
     r0  = scratch: (-dst) & 7 gives bytes needed to align dst.  */

EALIGN (BP_SYM (memcpy), 5, 0)
	cmpldi	cr1,5,31	/* cr1: is len < 32 (short-copy path)?  */
	neg	0,3		/* r0 = -dst; low 3 bits = bytes to DW align.  */
	std	30,-16(1)	/* Spill callee-saved r30/r31 into the
				   protected area below the stack pointer.  */
	std	31,-8(1)
	rldicl.	0,0,0,61	/* r0 &= 7; cr0 set for the 'beq 0f' below.  */
	mr	12,4		/* r12 = src working pointer.  */
	mr	31,5		/* r31 = len.  */
	mr	30,3		/* r30 = original dst (for return).  */
	ble-	cr1,.L2		/* len < 32: go straight to the tail copy.  */
	subf	31,0,5		/* r31 = len minus the alignment prefix.  */

  /* Move 0-7 bytes as needed to get the destination doubleword aligned.
     The bits of r0 (via the CR after mtcrf) select byte/halfword/word
     moves: CR bit 31 => 1 byte, bit 30 => 2 bytes, bit 29 => 4 bytes.  */
	beq	0f		/* Already doubleword aligned.  */
	mtcrf	0x01,0
1:	bf	31,2f
	lbz	6,0(12)
	addi	12,12,1
	stb	6,0(3)
	addi	3,3,1
2:	bf	30,4f
	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
4:	bf	29,0f
	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
0:
  /* Copy doublewords from source to destination, assuming the
     destination is aligned on a doubleword boundary.

     First verify that there is > 7 bytes to copy and check if the source
     is also doubleword aligned.  If there are < 8 bytes to copy fall
     through to the tail byte copy code.  Otherwise if the source and
     destination are both doubleword aligned use an optimized doubleword
     copy loop.  Otherwise the source has a different alignment and we use
     a load, shift, store strategy.  */
	rldicl.	0,12,0,61	/* cr0: src & 7 (source DW alignment).  */
	cmpldi	cr6,31,7
	ble-	cr6,.L2		/* less than 8 bytes left.  */
	srdi	11,31,3		/* r11 = doubleword count (for .L6 path).  */
	andi.	10,12,7		/* r10 = src misalignment (for .L6 path).  */
	bne-	0,.L6		/* Source is not DW aligned.  */
	srdi.	9,31,3		/* r9 = doubleword count for aligned loop.  */
	mr	10,3		/* r10 = dst, r11 = src for the loop below.  */
	mr	11,12

  /* Move doublewords where destination and source are aligned.
     Use an unrolled loop to copy 4 doublewords (32-bytes) per iteration.
     If the remainder is >0 and < 32 bytes copy 1-3 doublewords.  */
	cmpldi	cr1,9,4
	beq	0f		/* No whole doublewords to move.  */
	mtcrf	0x01,9		/* CR bits 30/31 = remainder DW count & 3.  */
	blt	cr1,2f		/* Fewer than 4 DWs: skip the unrolled loop.  */
	ld	6,0(11)		/* Pre-load first DW to overlap load/store.  */
	.align	4
4:
	ld	7,8(11)
	addi	9,9,-4
	std	6,0(10)
	ld	6,16(11)
	std	7,8(10)
	ld	7,24(11)
	addi	11,11,32
	cmpldi	cr1,9,4
	std	6,16(10)
	blt	cr1,3f		/* < 4 DWs left: don't pre-load past end.  */
	ld	6,0(11)
	std	7,24(10)
	addi	10,10,32
	b	4b
3:	std	7,24(10)	/* Flush the last pre-loaded doubleword.  */
	addi	10,10,32
2:	bf	30,1f		/* Copy 2 trailing doublewords if needed.  */
	ld	6,0(11)
	ld	7,8(11)
	addi	11,11,16
	std	6,0(10)
	std	7,8(10)
	addi	10,10,16
1:	bf	31,0f		/* Copy 1 trailing doubleword if needed.  */
	ld	6,0(11)
	addi	11,11,8
	std	6,0(10)
	addi	10,10,8
0:

.L8:
	/* Advance r3/r12 past the doublewords just copied and reduce
	   r31 to the residual byte count (len & 7) for the tail code.  */
	rldicr	0,31,0,60	/* r0 = len & ~7 (bytes already handled).  */
	rldicl	31,31,0,61	/* r31 = len & 7.  */
	add	3,3,0
	add	12,12,0

  /* Copy the tail for up to 31 bytes.  If this is the tail of a longer
     copy then the destination will be aligned and the length will be
     less than 8.  So it is normally not worth the set-up overhead to
     get doubleword aligned and do doubleword load/store.  */
.L2:
	mr.	10,31		/* r10 = remaining bytes; done if zero.  */
	cmpldi	cr1,31,4
	beq	0f
	mtcrf	0x01,31		/* CR bits 30/31 select halfword/byte tail.  */
	blt	cr1,2f
4:	lwz	6,0(12)		/* Word loop for 4..31 remaining bytes.  */
	addi	12,12,4
	addi	10,10,-4
	stw	6,0(3)
	cmpldi	cr1,10,4
	addi	3,3,4
	bge	cr1,4b
2:	bf	30,1f
	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
1:	bf	31,0f
	lbz	6,0(12)
	addi	12,12,1
	stb	6,0(3)
	addi	3,3,1
0:
  /* Return original dst pointer.  */
	ld	31,-8(1)
	mr	3,30
	ld	30,-16(1)
	blr

	.align	4
.L6:

  /* Copy doublewords where the destination is aligned but the source is
     not.  Use aligned doubleword loads from the source, shifted to realign
     the data, to allow aligned destination stores.
     Here: r10 = src misalignment in bits (after sldi), r9 = 64 - r10,
     r5 = doubleword-aligned src, r4 = dst, r11 = doubleword count.
     Each output DW is (prev << r10) | (next >> r9).  */
	subf	5,10,12		/* r5 = src rounded down to DW boundary.  */
	andi.	0,11,1		/* Odd doubleword count?  */
	sldi	10,10,3		/* Byte misalignment -> bit shift count.  */
	mr	4,3
	ld	6,0(5)		/* Prime the two-doubleword pipeline.  */
	ld	7,8(5)
	subfic	9,10,64		/* r9 = complementary shift.  */
	beq	2f
	sld	0,6,10		/* Odd count: peel one doubleword so the
				   main loop below handles pairs.  */
	addi	11,11,-1
	mr	6,7
	addi	4,4,-8		/* Bias dst; loop stores at 8(r4).  */
	cmpldi	11,0
	b	1f
2:	addi	5,5,8
	.align	4
0:	sld	0,6,10		/* Main loop: 2 doublewords per pass.  */
	srd	8,7,9
	addi	11,11,-2
	ld	6,8(5)
	or	0,0,8		/* Merge the two aligned halves.  */
	cmpldi	11,0
	std	0,0(4)
	sld	0,7,10
1:	srd	8,6,9
	or	0,0,8
	beq	8f		/* Count exhausted: store last DW and exit.  */
	ld	7,16(5)
	std	0,8(4)
	addi	5,5,16
	addi	4,4,16
	b	0b
8:
	std	0,8(4)
	b	.L8		/* Handle the residual (len & 7) bytes.  */
END_GEN_TB (BP_SYM (memcpy),TB_TOCLESS)
libc_hidden_builtin_def (memcpy)