]>
Commit | Line | Data |
---|---|---|
33b8d90a | 1 | /* Optimized memset implementation for PowerPC64/POWER7. |
568035b7 | 2 | Copyright (C) 2010-2013 Free Software Foundation, Inc. |
33b8d90a LM |
3 | Contributed by Luis Machado <luisgpm@br.ibm.com>. |
4 | This file is part of the GNU C Library. | |
5 | ||
6 | The GNU C Library is free software; you can redistribute it and/or | |
7 | modify it under the terms of the GNU Lesser General Public | |
8 | License as published by the Free Software Foundation; either | |
9 | version 2.1 of the License, or (at your option) any later version. | |
10 | ||
11 | The GNU C Library is distributed in the hope that it will be useful, | |
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
14 | Lesser General Public License for more details. | |
15 | ||
16 | You should have received a copy of the GNU Lesser General Public | |
59ba27a6 PE |
17 | License along with the GNU C Library; if not, see |
18 | <http://www.gnu.org/licenses/>. */ | |
33b8d90a LM |
19 | |
20 | #include <sysdep.h> | |
33b8d90a LM |
21 | |
22 | /* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]); | |
23 | Returns 's'. */ | |
24 | ||
25 | .machine power7 | |
2d67d91a | 26 | EALIGN (memset, 5, 0) |
33b8d90a LM |
27 | CALL_MCOUNT 3 |
28 | ||
29 | L(_memset): /* Also the entry point used by __bzero. */ | |
30 | cmpldi cr7,5,31 /* cr7 = length compared with 31. */ | |
31 | cmpldi cr6,5,8 /* cr6 = length compared with 8. */ | |
32 | mr 10,3 /* r10 = running DST pointer; r3 is left untouched as the return value. */ | |
33 | ||
34 | /* Replicate byte to word. */ | |
3be87c77 AM |
35 | insrdi 4,4,8,48 |
36 | insrdi 4,4,16,32 | |
33b8d90a LM |
37 | ble cr6,L(small) /* If length <= 8, use short copy code. */ |
38 | ||
39 | neg 0,3 /* r0 = -DST; its low bits give the distance to the next aligned address. */ | |
40 | ble cr7,L(medium) /* If length < 32, use medium copy code. */ | |
41 | ||
42 | andi. 11,10,7 /* Check alignment of DST. */ | |
43 | insrdi 4,4,32,0 /* Replicate word to double word. */ | |
44 | ||
45 | mr 12,5 /* r12 = original length; L(huge_loop) uses it to recompute what is left. */ | |
46 | beq L(big_aligned) /* DST already 8-byte aligned (cr0 from the andi. above). */ | |
47 | ||
48 | clrldi 0,0,61 /* r0 = (-DST) & 7 = bytes up to the next 8-byte boundary. */ | |
49 | mtocrf 0x01,0 /* cr7 = low 4 bits of r0, tested one bit at a time below. */ | |
50 | subf 5,0,5 /* length -= alignment bytes. */ | |
51 | ||
52 | /* Get DST aligned to 8 bytes. */ | |
53 | 1: bf 31,2f /* Bit 31 (value 1) clear: no single byte to store. */ | |
54 | ||
55 | stb 4,0(10) | |
56 | addi 10,10,1 | |
57 | 2: bf 30,4f /* Bit 30 (value 2) clear: no halfword to store. */ | |
58 | ||
59 | sth 4,0(10) | |
60 | addi 10,10,2 | |
61 | 4: bf 29,L(big_aligned) /* Bit 29 (value 4) clear: no word to store. */ | |
62 | ||
63 | stw 4,0(10) | |
64 | addi 10,10,4 | |
65 | ||
66 | .align 4 | |
67 | L(big_aligned): /* r10 is 8-byte aligned here; r5 = bytes still to store. */ | |
68 | ||
69 | cmpldi cr5,5,255 /* cr5 = remaining length compared with 255. */ | |
70 | li 0,32 /* NOTE(review): r0 looks overwritten on every path before it is read -- confirm this is dead. */ | |
71 | dcbtst 0,10 /* Touch-for-store hint on the cache block at r10. */ | |
72 | cmpldi cr6,4,0 /* cr6 = replicated fill value compared with 0. */ | |
73 | srdi 9,5,3 /* Number of full doublewords remaining. */ | |
74 | crand 27,26,21 /* CR bit 27 = cr6.eq AND cr5.gt: value == 0 and length > 255. */ | |
75 | mtocrf 0x01,9 /* cr7 = low 4 bits of the doubleword count. */ | |
76 | bt 27,L(huge) /* Zero fill of more than 255 bytes: use the dcbz path. */ | |
77 | ||
78 | /* From this point on, either the value isn't 0 or the length is | |
79 | at most 255, so the dcbz path is not used. */ | |
80 | ||
81 | srdi 8,5,5 /* r8 = number of 32-byte chunks (loaded into CTR below). */ | |
82 | clrldi 11,5,61 /* r11 = length & 7 = bytes left after the doublewords. */ | |
83 | cmpldi cr6,11,0 /* cr6.eq: no trailing bytes; tested in L(tail_bytes). */ | |
84 | cmpldi cr1,9,4 /* cr1.lt: fewer than 4 doublewords in total. */ | |
85 | mtctr 8 | |
86 | ||
87 | /* Store 0~3 doublewords first so that the main loop is left | |
88 | with a whole number of 32-byte chunks. */ | |
89 | ||
90 | bf 30,1f /* Bit 30 (value 2) clear: no 2-doubleword step. */ | |
91 | ||
92 | std 4,0(10) | |
93 | std 4,8(10) | |
94 | addi 10,10,16 | |
95 | bf 31,L(big_loop) /* Bit 31 (value 1) clear: no third doubleword. */ | |
96 | ||
97 | std 4,0(10) | |
98 | addi 10,10,8 | |
99 | mr 12,10 /* L(tail_bytes) stores through r12. */ | |
100 | blt cr1,L(tail_bytes) /* Fewer than 4 doublewords in total: no 32-byte chunk, skip the loop. */ | |
101 | b L(big_loop) | |
102 | ||
103 | .align 4 | |
104 | 1: /* Copy 1 doubleword. */ | |
105 | bf 31,L(big_loop) | |
106 | ||
107 | std 4,0(10) | |
108 | addi 10,10,8 | |
109 | ||
110 | /* Main aligned copy loop. Stores 32 bytes per CTR count, | |
111 | ping-ponging between r10 and r12 to avoid AGEN delays. */ | |
112 | .align 4 | |
113 | L(big_loop): | |
114 | addi 12,10,32 /* r12 = address of the chunk after this one. */ | |
115 | std 4,0(10) | |
116 | std 4,8(10) | |
117 | std 4,16(10) | |
118 | std 4,24(10) | |
119 | bdz L(tail_bytes) /* CTR exhausted: r12 already points just past the stored data. */ | |
120 | ||
121 | addi 10,10,64 /* r10 = address of the chunk after the one stored via r12. */ | |
122 | std 4,0(12) | |
123 | std 4,8(12) | |
124 | std 4,16(12) | |
125 | std 4,24(12) | |
126 | bdnz L(big_loop) | |
127 | ||
128 | mr 12,10 /* L(tail_bytes) stores through r12. */ | |
129 | b L(tail_bytes) | |
130 | ||
131 | .align 4 | |
132 | L(tail_bytes): /* r12 = next address to store; r5 = length after the alignment prologue. */ | |
133 | ||
134 | /* Check for tail bytes. */ | |
135 | beqlr cr6 /* cr6.eq was set from (length & 7) == 0: nothing left, return. */ | |
136 | ||
137 | clrldi 0,5,61 /* r0 = length & 7. */ | |
138 | mtocrf 0x01,0 /* cr7 = low 4 bits of r0; bits 29/30/31 select the 4/2/1-byte stores. */ | |
139 | ||
140 | /* At this point we have a tail of 1-7 bytes and we know that the | |
141 | destination is doubleword-aligned. */ | |
142 | 4: /* Copy 4 bytes. */ | |
143 | bf 29,2f | |
144 | ||
145 | stw 4,0(12) | |
146 | addi 12,12,4 | |
147 | 2: /* Copy 2 bytes. */ | |
148 | bf 30,1f | |
149 | ||
150 | sth 4,0(12) | |
151 | addi 12,12,2 | |
152 | 1: /* Copy 1 byte. */ | |
153 | bflr 31 /* Return if no odd byte remains. */ | |
154 | ||
155 | stb 4,0(12) | |
156 | blr | |
157 | ||
158 | /* Special case when value is 0 and we have a long length to deal | |
159 | with. Use dcbz to zero out 128-bytes at a time. Before using | |
160 | dcbz though, we need to get the destination 128-bytes aligned. */ | |
161 | .align 4 | |
162 | L(huge): | |
163 | andi. 11,10,127 /* cr0.eq if DST is already 128-byte aligned. */ | |
164 | neg 0,10 /* r0 = -DST. */ | |
165 | beq L(huge_aligned) | |
166 | ||
167 | clrldi 0,0,57 /* r0 = (-DST) & 127 = bytes up to the next 128-byte boundary. */ | |
168 | subf 5,0,5 /* length -= alignment bytes. */ | |
169 | srdi 0,0,3 /* r0 = the same distance in doublewords. */ | |
170 | mtocrf 0x01,0 /* cr7 = low 4 bits of that count; bits 28..31 select 8/4/2/1 doublewords. */ | |
171 | ||
172 | /* Get DST aligned to 128 bytes. */ | |
173 | 8: bf 28,4f /* Bit 28 clear: no 64-byte step. */ | |
174 | ||
175 | std 4,0(10) | |
176 | std 4,8(10) | |
177 | std 4,16(10) | |
178 | std 4,24(10) | |
179 | std 4,32(10) | |
180 | std 4,40(10) | |
181 | std 4,48(10) | |
182 | std 4,56(10) | |
183 | addi 10,10,64 | |
184 | .align 4 | |
185 | 4: bf 29,2f /* Bit 29 clear: no 32-byte step. */ | |
186 | ||
187 | std 4,0(10) | |
188 | std 4,8(10) | |
189 | std 4,16(10) | |
190 | std 4,24(10) | |
191 | addi 10,10,32 | |
192 | .align 4 | |
193 | 2: bf 30,1f /* Bit 30 clear: no 16-byte step. */ | |
194 | ||
195 | std 4,0(10) | |
196 | std 4,8(10) | |
197 | addi 10,10,16 | |
198 | .align 4 | |
199 | 1: bf 31,L(huge_aligned) /* Bit 31 clear: no 8-byte step. */ | |
200 | ||
201 | std 4,0(10) | |
202 | addi 10,10,8 | |
203 | ||
204 | ||
205 | L(huge_aligned): | |
206 | srdi 8,5,7 /* r8 = number of 128-byte blocks. */ | |
207 | clrldi 11,5,57 /* r11 = length & 127 = bytes left after those blocks. */ | |
208 | cmpldi cr6,11,0 /* cr6.eq: nothing left after the dcbz loop. */ | |
209 | mtctr 8 | |
210 | ||
211 | .align 4 | |
212 | L(huge_loop): | |
213 | dcbz 0,10 /* Zero the cache block at r10. */ | |
214 | addi 10,10,128 | |
215 | bdnz L(huge_loop) | |
216 | ||
217 | /* Check how many bytes are still left. */ | |
218 | beqlr cr6 | |
219 | ||
220 | subf 9,3,10 /* r9 = r10 - r3 = bytes written so far. */ | |
221 | subf 5,9,12 /* r5 = r12 - r9 = original length minus bytes written. */ | |
222 | srdi 8,5,3 /* r8 = full doublewords still to store. */ | |
223 | cmpldi cr6,8,0 | |
224 | mtocrf 0x01,8 /* cr7 = low 4 bits of the doubleword count. */ | |
225 | ||
226 | /* We have a tail of 1~127 bytes. Copy up to 15 doublewords for | |
227 | speed. We'll handle the resulting tail bytes later. */ | |
228 | beq cr6,L(tail) /* No full doubleword left: go straight to the byte tail. */ | |
229 | ||
230 | 8: bf 28,4f /* Bit 28 clear: no 64-byte step. */ | |
231 | ||
232 | std 4,0(10) | |
233 | std 4,8(10) | |
234 | std 4,16(10) | |
235 | std 4,24(10) | |
236 | std 4,32(10) | |
237 | std 4,40(10) | |
238 | std 4,48(10) | |
239 | std 4,56(10) | |
240 | addi 10,10,64 | |
241 | .align 4 | |
242 | 4: bf 29,2f /* Bit 29 clear: no 32-byte step. */ | |
243 | ||
244 | std 4,0(10) | |
245 | std 4,8(10) | |
246 | std 4,16(10) | |
247 | std 4,24(10) | |
248 | addi 10,10,32 | |
249 | .align 4 | |
250 | 2: bf 30,1f /* Bit 30 clear: no 16-byte step. */ | |
251 | ||
252 | std 4,0(10) | |
253 | std 4,8(10) | |
254 | addi 10,10,16 | |
255 | .align 4 | |
256 | 1: bf 31,L(tail) /* Bit 31 clear: no 8-byte step. */ | |
257 | ||
258 | std 4,0(10) | |
259 | addi 10,10,8 | |
260 | ||
261 | /* Handle the rest of the tail bytes here. */ | |
262 | L(tail): | |
263 | mtocrf 0x01,5 /* cr7 = low 4 bits of the remaining length; bits 29/30/31 select 4/2/1 bytes. */ | |
264 | ||
265 | .align 4 | |
266 | 4: bf 29,2f | |
267 | ||
268 | stw 4,0(10) | |
269 | addi 10,10,4 | |
270 | .align 4 | |
271 | 2: bf 30,1f | |
272 | ||
273 | sth 4,0(10) | |
274 | addi 10,10,2 | |
275 | .align 4 | |
276 | 1: bflr 31 /* Return if no odd byte remains. */ | |
277 | ||
278 | stb 4,0(10) | |
279 | blr | |
280 | ||
281 | /* Expanded tree to copy tail bytes without increments. Entered from L(small) with the low 4 bits of the length in cr7; every leaf stores at a fixed offset from r10 and returns. */ | |
282 | .align 4 | |
283 | L(copy_tail): | |
284 | bf 29,L(FXX) /* Bit 29 (value 4) clear: no word store. */ | |
285 | ||
286 | stw 4,0(10) | |
287 | bf 30,L(TFX) /* Bit 30 (value 2) clear: no halfword after the word. */ | |
288 | ||
289 | sth 4,4(10) | |
290 | bflr 31 /* Return if no odd byte remains. */ | |
291 | ||
292 | stb 4,6(10) | |
293 | blr | |
294 | ||
295 | .align 4 | |
296 | L(FXX): bf 30,L(FFX) /* No word was stored; bit 30 clear: no halfword either. */ | |
297 | ||
298 | sth 4,0(10) | |
299 | bflr 31 | |
300 | ||
301 | stb 4,2(10) | |
302 | blr | |
303 | ||
304 | .align 4 | |
305 | L(TFX): bflr 31 /* Word stored, no halfword: only the odd byte may remain. */ | |
306 | ||
307 | stb 4,4(10) | |
308 | blr | |
309 | ||
310 | .align 4 | |
311 | L(FFX): bflr 31 /* Neither word nor halfword: only the odd byte may remain. */ | |
312 | ||
313 | stb 4,0(10) | |
314 | blr | |
315 | ||
316 | /* Handle copies of 9~31 bytes. */ | |
317 | .align 4 | |
318 | L(medium): | |
319 | /* At least 9 bytes to go. */ | |
320 | andi. 11,10,3 /* cr0.eq if DST is already word-aligned. */ | |
321 | clrldi 0,0,62 /* r0 = (-DST) & 3; r0 was set to -DST before branching here. */ | |
322 | beq L(medium_aligned) | |
323 | ||
3be87c77 | 324 | /* Force 4-byte alignment for DST. */ |
33b8d90a LM |
325 | mtocrf 0x01,0 |
326 | subf 5,0,5 /* length -= alignment bytes. */ | |
327 | 1: /* Copy 1 byte. */ | |
328 | bf 31,2f | |
329 | ||
330 | stb 4,0(10) | |
331 | addi 10,10,1 | |
332 | 2: /* Copy 2 bytes. */ | |
333 | bf 30,L(medium_aligned) | |
334 | ||
335 | sth 4,0(10) | |
336 | addi 10,10,2 | |
337 | ||
338 | .align 4 | |
339 | L(medium_aligned): | |
340 | /* At least 6 bytes to go, and DST is word-aligned. */ | |
341 | cmpldi cr1,5,16 | |
342 | mtocrf 0x01,5 /* cr7 = low 4 bits of the remaining length; bits 28..31 select 8/4/2/1 bytes. */ | |
343 | blt cr1,8f /* Fewer than 16 bytes left: skip the 16-byte step. */ | |
344 | ||
345 | /* Copy 16 bytes. */ | |
346 | stw 4,0(10) | |
347 | stw 4,4(10) | |
348 | stw 4,8(10) | |
349 | stw 4,12(10) | |
350 | addi 10,10,16 | |
351 | 8: /* Copy 8 bytes. */ | |
352 | bf 28,4f | |
353 | ||
354 | stw 4,0(10) | |
355 | stw 4,4(10) | |
356 | addi 10,10,8 | |
357 | 4: /* Copy 4 bytes. */ | |
358 | bf 29,2f | |
359 | ||
360 | stw 4,0(10) | |
361 | addi 10,10,4 | |
362 | 2: /* Copy 2 bytes. */ | |
363 | bf 30,1f | |
364 | ||
365 | sth 4,0(10) | |
366 | addi 10,10,2 | |
367 | 1: /* Copy 1 byte. */ | |
368 | bflr 31 /* Return if no odd byte remains. */ | |
369 | ||
370 | stb 4,0(10) | |
371 | blr | |
372 | ||
373 | /* Handles copies of 0~8 bytes. */ | |
374 | .align 4 | |
375 | L(small): | |
376 | mtocrf 0x01,5 /* cr7 = low 4 bits of the length, consumed by L(copy_tail). */ | |
377 | bne cr6,L(copy_tail) /* cr6 still holds length vs. 8: anything but 8 goes to the tail tree. */ | |
378 | ||
379 | stw 4,0(10) /* Exactly 8 bytes: two word stores. */ | |
380 | stw 4,4(10) | |
381 | blr | |
382 | ||
2d67d91a | 383 | END_GEN_TB (memset,TB_TOCLESS) |
33b8d90a LM |
384 | libc_hidden_builtin_def (memset) |
385 | ||
8a29a3d0 | 386 | #ifndef NO_BZERO_IMPL |
33b8d90a LM |
387 | /* Copied from bzero.S to prevent the linker from inserting a stub |
388 | between bzero and memset. __bzero rearranges its arguments into memset's register layout and jumps into memset. */ | |
2d67d91a | 389 | ENTRY (__bzero) |
33b8d90a LM |
390 | CALL_MCOUNT 3 |
391 | mr r5,r4 /* The length arrives in r4; memset expects it in r5. */ | |
392 | li r4,0 /* Fill value is 0. */ | |
393 | b L(_memset) /* Enter memset at the label that follows its own CALL_MCOUNT. */ | |
2d67d91a | 394 | END_GEN_TB (__bzero,TB_TOCLESS) |
33b8d90a | 395 | |
2d67d91a | 396 | weak_alias (__bzero, bzero) |
8a29a3d0 | 397 | #endif |