/* Optimized memcpy implementation for PowerPC64.
   Copyright (C) 2003 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, write to the Free
   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307 USA.  */

20 | #include <sysdep.h> | |
21 | #include <bp-sym.h> | |
22 | #include <bp-asm.h> | |
23 | ||
24 | /* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]); | |
25 | Returns 'dst'. | |
26 | ||
27 | Memcpy handles short copies (< 32-bytes) using an unaligned | |
28 | word lwz/stw loop. The tail (remaining 1-3) bytes is handled with the | |
29 | appropriate combination of byte and halfword load/stores. There is no | |
30 | attempt to optimize the alignment of short moves. The 64-bit | |
31 | implementations of POWER3 and POWER4 do a reasonable job of handling | |
32 | unligned load/stores that do not cross 32-byte boundries. | |
33 | ||
34 | Longer moves (>= 32-bytes) justify the effort to get at least the | |
35 | destination doubleword (8-byte) aligned. Further optimization is | |
36 | posible when both source and destination are doubleword aligned. | |
37 | Each case has a optimized unrolled loop. */ | |
38 | ||
39 | EALIGN (BP_SYM (memcpy), 5, 0) | |
40 | cmpldi cr1,5,31 | |
41 | neg 0,3 | |
42 | std 30,-16(1) | |
43 | std 31,-8(1) | |
44 | rldicl. 0,0,0,61 | |
45 | mr 12,4 | |
46 | mr 31,5 | |
47 | mr 30,3 | |
48 | ble- cr1,.L2 | |
49 | subf 31,0,5 | |
50 | ||
51 | /* Move 0-7 bytes as needed to get the destination doubleword alligned. */ | |
52 | beq 0f | |
53 | mtcrf 0x01,0 | |
54 | 1: bf 31,2f | |
55 | lbz 6,0(12) | |
56 | addi 12,12,1 | |
57 | stb 6,0(3) | |
58 | addi 3,3,1 | |
59 | 2: bf 30,4f | |
60 | lhz 6,0(12) | |
61 | addi 12,12,2 | |
62 | sth 6,0(3) | |
63 | addi 3,3,2 | |
64 | 4: bf 29,0f | |
65 | lwz 6,0(12) | |
66 | addi 12,12,4 | |
67 | stw 6,0(3) | |
68 | addi 3,3,4 | |
69 | 0: | |
70 | /* Copy doublewords from source to destination, assumpting the | |
71 | destination is aligned on a doubleword boundary. | |
72 | ||
73 | First verify that there is > 7 bytes to copy and check if the source | |
74 | is also doubleword aligned. If there are < 8 bytes to copy fall | |
75 | through to the tail byte copy code. Otherwise if the source and | |
76 | destination are both doubleword aligned use an optimized doubleword | |
77 | copy loop. Otherwise the source has a different alignment and we use | |
78 | a load, shift, store strategy. */ | |
79 | rldicl. 0,12,0,61 | |
80 | cmpldi cr6,31,7 | |
81 | ble- cr6,.L2 /* less than 8 bytes left. */ | |
7c3164bc UD |
82 | srdi 11,31,3 |
83 | andi. 10,12,7 | |
a14b373c UD |
84 | bne- 0,.L6 /* Source is not DW aligned. */ |
85 | srdi. 9,31,3 | |
86 | mr 10,3 | |
87 | mr 11,12 | |
88 | ||
89 | /* Move doublewords where destination and source are aligned. | |
90 | Use a unrolled loop to copy 4 doubleword (32-bytes) per iteration. | |
91 | If the remainder is >0 and < 32 bytes copy 1-3 doublewords. */ | |
92 | cmpldi cr1,9,4 | |
93 | beq 0f | |
94 | mtcrf 0x01,9 | |
95 | blt cr1,2f | |
96 | ld 6,0(11) | |
97 | .align 4 | |
98 | 4: | |
99 | ld 7,8(11) | |
100 | addi 9,9,-4 | |
101 | std 6,0(10) | |
102 | ld 6,16(11) | |
103 | std 7,8(10) | |
104 | ld 7,24(11) | |
105 | addi 11,11,32 | |
106 | cmpldi cr1,9,4 | |
107 | std 6,16(10) | |
108 | blt cr1,3f | |
109 | ld 6,0(11) | |
110 | std 7,24(10) | |
111 | addi 10,10,32 | |
112 | b 4b | |
113 | 3: std 7,24(10) | |
114 | addi 10,10,32 | |
115 | 2: bf 30,1f | |
116 | ld 6,0(11) | |
117 | ld 7,8(11) | |
118 | addi 11,11,16 | |
119 | std 6,0(10) | |
120 | std 7,8(10) | |
121 | addi 10,10,16 | |
122 | 1: bf 31,0f | |
123 | ld 6,0(11) | |
124 | addi 11,11,8 | |
125 | std 6,0(10) | |
126 | addi 10,10,8 | |
127 | 0: | |
128 | ||
129 | .L8: | |
130 | rldicr 0,31,0,60 | |
131 | rldicl 31,31,0,61 | |
132 | add 3,3,0 | |
133 | add 12,12,0 | |
134 | ||
135 | /* Copy the tail for up to 31 bytes. If this is the tail of a longer | |
136 | copy then the destination will be aligned and the length will be | |
137 | less than 8. So it is normally not worth the set-up overhead to | |
138 | get doubleword aligned and do doubleword load/store. */ | |
139 | .L2: | |
140 | mr. 10,31 | |
141 | cmpldi cr1,31,4 | |
142 | beq 0f | |
143 | mtcrf 0x01,31 | |
144 | blt cr1,2f | |
145 | 4: lwz 6,0(12) | |
146 | addi 12,12,4 | |
147 | addi 10,10,-4 | |
148 | stw 6,0(3) | |
149 | cmpldi cr1,10,4 | |
150 | addi 3,3,4 | |
151 | bge cr1,4b | |
152 | 2: bf 30,1f | |
153 | lhz 6,0(12) | |
154 | addi 12,12,2 | |
155 | sth 6,0(3) | |
156 | addi 3,3,2 | |
157 | 1: bf 31,0f | |
158 | lbz 6,0(12) | |
159 | addi 12,12,1 | |
160 | stb 6,0(3) | |
161 | addi 3,3,1 | |
162 | 0: | |
163 | /* Return original dst pointer. */ | |
164 | ld 31,-8(1) | |
165 | mr 3,30 | |
166 | ld 30,-16(1) | |
167 | blr | |
168 | ||
7c3164bc | 169 | .align 4 |
a14b373c | 170 | .L6: |
a14b373c UD |
171 | |
172 | /* Copy doublewords where the destination is aligned but the source is | |
173 | not. Use aligned doubleword loads from the source, shifted to realign | |
174 | the data, to allow aligned destination stores. */ | |
7c3164bc | 175 | subf 5,10,12 |
a14b373c | 176 | andi. 0,11,1 |
a14b373c | 177 | sldi 10,10,3 |
7c3164bc UD |
178 | mr 4,3 |
179 | ld 6,0(5) | |
a14b373c UD |
180 | ld 7,8(5) |
181 | subfic 9,10,64 | |
182 | beq 2f | |
183 | sld 0,6,10 | |
184 | addi 11,11,-1 | |
185 | mr 6,7 | |
186 | addi 4,4,-8 | |
187 | cmpldi 11,0 | |
188 | b 1f | |
189 | 2: addi 5,5,8 | |
190 | .align 4 | |
191 | 0: sld 0,6,10 | |
192 | srd 8,7,9 | |
193 | addi 11,11,-2 | |
194 | ld 6,8(5) | |
195 | or 0,0,8 | |
196 | cmpldi 11,0 | |
197 | std 0,0(4) | |
198 | sld 0,7,10 | |
199 | 1: srd 8,6,9 | |
200 | or 0,0,8 | |
201 | beq 8f | |
202 | ld 7,16(5) | |
203 | std 0,8(4) | |
204 | addi 5,5,16 | |
205 | addi 4,4,16 | |
206 | b 0b | |
207 | 8: | |
208 | std 0,8(4) | |
209 | b .L8 | |
210 | END_GEN_TB (BP_SYM (memcpy),TB_TOCLESS) | |
85dd1003 | 211 | libc_hidden_builtin_def (memcpy) |