]>
Commit | Line | Data |
---|---|---|
33b8d90a | 1 | /* Optimized memset implementation for PowerPC32/POWER7. |
688903eb | 2 | Copyright (C) 2010-2018 Free Software Foundation, Inc. |
33b8d90a LM |
3 | Contributed by Luis Machado <luisgpm@br.ibm.com>. |
4 | This file is part of the GNU C Library. | |
5 | ||
6 | The GNU C Library is free software; you can redistribute it and/or | |
7 | modify it under the terms of the GNU Lesser General Public | |
8 | License as published by the Free Software Foundation; either | |
9 | version 2.1 of the License, or (at your option) any later version. | |
10 | ||
11 | The GNU C Library is distributed in the hope that it will be useful, | |
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
14 | Lesser General Public License for more details. | |
15 | ||
16 | You should have received a copy of the GNU Lesser General Public | |
59ba27a6 PE |
17 | License along with the GNU C Library; if not, see |
18 | <http://www.gnu.org/licenses/>. */ | |
33b8d90a LM |
19 | |
20 | #include <sysdep.h> | |
33b8d90a | 21 | |
f17a4233 | 22 | /* void * [r3] memset (void *s [r3], int c [r4], size_t n [r5]); |
33b8d90a LM |
23 | Returns 's'. */ |
24 | ||
25 | .machine power7 | |
b5510883 | 26 | EALIGN (memset, 5, 0) /* Entry: r3 = s, r4 = c, r5 = n. r3 is never written below, so it is returned unchanged. */ |
33b8d90a LM |
27 | CALL_MCOUNT |
28 | ||
29 | .align 4 | |
30 | L(_memset): | |
31 | cmplwi cr7,5,31 /* cr7: n vs. 31, consumed by the L(medium) branch. */ | |
32 | cmplwi cr6,5,8 /* cr6: n vs. 8, consumed by the L(small) branch. */ | |
33 | mr 10,3 /* r10 = running DST pointer; r3 keeps the original s. */ | |
34 | mr 7,1 /* Save original r1; it is copied back from r7 on the paths that run the stwu below. */ | |
35 | cfi_offset(31,-8) /* NOTE(review): r31 is not saved anywhere in this function; confirm this CFI entry is intended. */ | |
36 | ||
37 | /* Replicate byte to word. */ | |
d298c416 AZ |
38 | insrwi 4,4,8,16 /* Copy the low byte of r4 into the next byte up: low halfword = cc. */ |
39 | insrwi 4,4,16,0 /* Copy the low halfword into the high halfword: r4 = cccc. */ | |
33b8d90a LM |
40 | |
41 | ble cr6,L(small) /* If length <= 8, use short copy code. */ | |
42 | ||
43 | neg 0,3 /* r0 = -DST; its low bits give the bytes needed to reach alignment. */ | |
44 | ble cr7,L(medium) /* If length < 32, use medium copy code. */ | |
45 | ||
46 | /* Save our word twice to create a doubleword that we will later | |
47 | copy to a FPR. */ | |
48 | stwu 1,-32(1) /* Allocate a 32-byte frame; the old r1 is stored at the new 0(r1). */ | |
49 | andi. 11,10,7 /* Check alignment of DST. */ | |
50 | mr 12,5 /* r12 = original length; L(huge_aligned) uses it to recompute the tail. */ | |
51 | stw 4,24(1) | |
52 | stw 4,28(1) | |
53 | beq L(big_aligned) /* cr0.eq from the andi. above: DST is already 8-byte aligned. */ | |
54 | ||
55 | clrlwi 0,0,29 /* r0 = (-DST) & 7 = bytes needed to reach 8-byte alignment. */ | |
56 | mtocrf 0x01,0 /* cr7 = low 4 bits of r0, tested bit by bit below. */ | |
57 | subf 5,0,5 /* Take the alignment bytes off the remaining length. */ | |
58 | ||
59 | /* Get DST aligned to 8 bytes. */ | |
60 | 1: bf 31,2f /* CR bit 31 set means 1 byte is needed. */ | |
61 | ||
62 | stb 4,0(10) | |
63 | addi 10,10,1 | |
64 | 2: bf 30,4f /* CR bit 30 set means 2 bytes are needed. */ | |
65 | ||
66 | sth 4,0(10) | |
67 | addi 10,10,2 | |
68 | 4: bf 29,L(big_aligned) /* CR bit 29 set means 4 bytes are needed. */ | |
69 | ||
70 | stw 4,0(10) | |
71 | addi 10,10,4 | |
72 | ||
73 | .align 4 | |
74 | L(big_aligned): /* Here r10 is 8-byte aligned and r5 = bytes still to set. */ | |
75 | cmplwi cr5,5,255 /* cr5.gt: more than 255 bytes remain. */ | |
76 | li 0,32 /* NOTE(review): r0 looks unused until it is overwritten later; confirm. */ | |
77 | cmplwi cr1,5,160 /* NOTE(review): cr1 is recomputed below before any visible use; confirm. */ | |
78 | dcbtst 0,10 /* Cache touch-for-store hint on the block at r10. */ | |
79 | cmplwi cr6,4,0 /* cr6.eq: the fill word is zero. */ | |
80 | srwi 9,5,3 /* Number of full doublewords remaining. */ | |
81 | crand 27,26,21 /* CR bit 27 = cr6.eq AND cr5.gt, i.e. zero fill and more than 255 bytes. */ | |
82 | mtocrf 0x01,9 /* cr7 = low 4 bits of the doubleword count. */ | |
83 | bt 27,L(huge) | |
84 | ||
85 | /* From this point on, either the value isn't 0 or the length is | |
86 | at most 255 bytes, so the dcbz path in L(huge) is not taken. */ | |
87 | ||
88 | srwi 8,5,5 /* r8 = number of full 32-byte blocks. */ | |
89 | clrlwi 11,5,29 /* r11 = trailing bytes (r5 & 7). */ | |
90 | cmplwi cr6,11,0 /* cr6.eq: no trailing bytes; tested in L(tail_bytes). */ | |
91 | cmplwi cr1,9,4 /* cr1.lt: fewer than 4 doublewords, so no 32-byte block. */ | |
92 | mtctr 8 | |
93 | ||
94 | /* Copy 1~3 doublewords so the main loop starts | |
95 | at a multiple of 32 bytes. */ | |
96 | ||
97 | bf 30,1f /* CR bit 30 clear: no 2-doubleword chunk. */ | |
98 | ||
99 | stw 4,0(10) | |
100 | stw 4,4(10) | |
101 | stw 4,8(10) | |
102 | stw 4,12(10) | |
103 | addi 10,10,16 | |
104 | bf 31,L(big_loop) /* CR bit 31 clear: no odd doubleword left over. */ | |
105 | ||
106 | stw 4,0(10) | |
107 | stw 4,4(10) | |
108 | addi 10,10,8 | |
109 | mr 12,10 /* L(tail_bytes) stores through r12. */ | |
110 | blt cr1,L(tail_bytes) /* Only 3 doublewords in total: skip the 32-byte loops. */ | |
111 | ||
112 | b L(big_loop) | |
113 | ||
114 | .align 4 | |
115 | 1: /* Copy 1 doubleword. */ | |
116 | bf 31,L(big_loop) | |
117 | ||
118 | stw 4,0(10) | |
119 | stw 4,4(10) | |
120 | addi 10,10,8 | |
121 | ||
122 | /* First use a 32-bytes loop with stw's to try and avoid the LHS due | |
123 | to the lxvdsx from 24(r1) we will do next. Also, ping-pong through | |
124 | r10 and r12 to avoid AGEN delays. */ | |
125 | .align 4 | |
126 | L(big_loop): /* Entered with ctr = number of 32-byte blocks left. */ | |
127 | addi 12,10,32 /* r12 = address of the following 32-byte block. */ | |
128 | stw 4,0(10) | |
129 | stw 4,4(10) | |
130 | stw 4,8(10) | |
131 | stw 4,12(10) | |
132 | stw 4,16(10) | |
133 | stw 4,20(10) | |
134 | stw 4,24(10) | |
135 | stw 4,28(10) | |
136 | bdz L(tail_bytes) /* Last block done; r12 already points past it. */ | |
137 | ||
138 | addi 10,10,64 /* r10 = address after the block stored through r12 below. */ | |
139 | stw 4,0(12) | |
140 | stw 4,4(12) | |
141 | stw 4,8(12) | |
142 | stw 4,12(12) | |
143 | stw 4,16(12) | |
144 | stw 4,20(12) | |
145 | stw 4,24(12) | |
146 | stw 4,28(12) | |
147 | bdnz L(big_loop_fast_setup) /* More blocks remain: continue with VSX stores. */ | |
148 | ||
149 | mr 12,10 /* L(tail_bytes) stores through r12. */ | |
150 | b L(tail_bytes) | |
151 | ||
152 | /* Now that we're probably past the LHS window, use the VSX to | |
153 | speed up the loop. */ | |
154 | L(big_loop_fast_setup): | |
33b8d90a LM |
155 | li 11,24 /* Offset of the saved doubleword in the frame. */ |
156 | li 6,16 /* Index for the second 16-byte store of each block. */ | |
157 | lxvdsx 4,1,11 /* VSR4 = doubleword at r1+24, replicated into both halves. */ | |
158 | ||
159 | .align 4 | |
160 | L(big_loop_fast): | |
161 | addi 12,10,32 /* Same r10/r12 ping-pong as in L(big_loop). */ | |
ebd2e13d | 162 | stxvd2x 4,0,10 /* Store 16 bytes at r10. */ |
33b8d90a LM |
163 | stxvd2x 4,10,6 /* Store 16 bytes at r10+16. */ |
164 | bdz L(tail_bytes) | |
165 | ||
166 | addi 10,10,64 | |
ebd2e13d | 167 | stxvd2x 4,0,12 /* Store 16 bytes at r12. */ |
33b8d90a LM |
168 | stxvd2x 4,12,6 /* Store 16 bytes at r12+16. */ |
169 | bdnz L(big_loop_fast) | |
170 | ||
171 | mr 12,10 /* Falls through to L(tail_bytes), which stores through r12. */ | |
172 | ||
173 | .align 4 | |
174 | L(tail_bytes): /* Entry: r12 = next byte to set, low 3 bits of r5 = tail length, cr6.eq = no tail. */ | |
175 | ||
176 | /* Check for tail bytes. */ | |
177 | mr 1,7 /* Restore r1. */ | |
178 | beqlr cr6 /* Return now if no tail bytes are left; r3 still holds s. */ | |
179 | ||
180 | clrlwi 0,5,29 /* r0 = r5 & 7 = number of tail bytes. */ | |
181 | mtocrf 0x01,0 /* cr7 = low 4 bits of r0; CR bits 29, 30, 31 select 4, 2, 1 bytes. */ | |
182 | ||
183 | /* At this point we have a tail of 1-7 bytes and we know that the | |
184 | destination is doubleword-aligned. */ | |
185 | 4: /* Copy 4 bytes. */ | |
186 | bf 29,2f | |
187 | ||
188 | stw 4,0(12) | |
189 | addi 12,12,4 | |
190 | 2: /* Copy 2 bytes. */ | |
191 | bf 30,1f | |
192 | ||
193 | sth 4,0(12) | |
194 | addi 12,12,2 | |
195 | 1: /* Copy 1 byte. */ | |
196 | bflr 31 /* Return if the length is even. */ | |
197 | ||
198 | stb 4,0(12) | |
199 | blr | |
200 | ||
201 | ||
202 | /* Special case when value is 0 and we have a long length to deal | |
203 | with. Use dcbz to zero out 128-bytes at a time. Before using | |
204 | dcbz though, we need to get the destination 128-bytes aligned. */ | |
205 | .align 4 | |
206 | L(huge): /* Reached only from L(big_aligned): r10 is 8-byte aligned and r4 is 0. */ | |
207 | lfd 4,24(1) /* f4 = the doubleword stored at 24(r1) and 28(r1), two copies of the fill word. */ | |
208 | andi. 11,10,127 /* cr0.eq: DST is already 128-byte aligned. */ | |
209 | neg 0,10 | |
210 | beq L(huge_aligned) | |
211 | ||
212 | clrlwi 0,0,25 /* r0 = (-DST) & 127 = bytes needed to reach 128-byte alignment. */ | |
213 | subf 5,0,5 /* Take the alignment bytes off the remaining length. */ | |
214 | srwi 0,0,3 /* Convert the byte count to doublewords. */ | |
215 | mtocrf 0x01,0 /* cr7 = low 4 bits; CR bits 28-31 select 8, 4, 2, 1 doublewords. */ | |
216 | ||
217 | /* Get DST aligned to 128 bytes. */ | |
218 | 8: bf 28,4f | |
219 | ||
220 | stfd 4,0(10) | |
221 | stfd 4,8(10) | |
222 | stfd 4,16(10) | |
223 | stfd 4,24(10) | |
224 | stfd 4,32(10) | |
225 | stfd 4,40(10) | |
226 | stfd 4,48(10) | |
227 | stfd 4,56(10) | |
228 | addi 10,10,64 | |
229 | .align 4 | |
230 | 4: bf 29,2f | |
231 | ||
232 | stfd 4,0(10) | |
233 | stfd 4,8(10) | |
234 | stfd 4,16(10) | |
235 | stfd 4,24(10) | |
236 | addi 10,10,32 | |
237 | .align 4 | |
238 | 2: bf 30,1f | |
239 | ||
240 | stfd 4,0(10) | |
241 | stfd 4,8(10) | |
242 | addi 10,10,16 | |
243 | .align 4 | |
244 | 1: bf 31,L(huge_aligned) | |
245 | ||
246 | stfd 4,0(10) | |
247 | addi 10,10,8 | |
248 | ||
249 | L(huge_aligned): /* Here r10 is 128-byte aligned and r5 = bytes still to zero. */ | |
250 | srwi 8,5,7 /* r8 = number of full 128-byte blocks. */ | |
251 | clrlwi 11,5,25 /* r11 = r5 & 127 = bytes left after the dcbz loop. */ | |
252 | cmplwi cr6,11,0 /* cr6.eq: nothing left after the loop. */ | |
253 | mtctr 8 | |
254 | ||
255 | /* Zeroes 128 bytes at a time. */ | |
256 | .align 4 | |
257 | L(huge_loop): | |
258 | dcbz 0,10 /* Zero the cache block at r10; the code assumes it is 128 bytes. */ | |
259 | addi 10,10,128 | |
260 | bdnz L(huge_loop) | |
261 | ||
262 | /* We have a tail of 0~127 bytes to handle. */ | |
263 | mr 1,7 /* Restore r1. */ | |
264 | beqlr cr6 /* Return now if nothing is left; r3 still holds s. */ | |
265 | ||
266 | subf 9,3,10 /* r9 = r10 - s = bytes written so far. */ | |
267 | subf 5,9,12 /* r5 = original length (r12) - r9 = bytes still to write. */ | |
268 | srwi 8,5,3 /* r8 = number of full doublewords in the tail. */ | |
269 | cmplwi cr6,8,0 | |
270 | mtocrf 0x01,8 /* cr7 = low 4 bits of r8; CR bits 28-31 select 8, 4, 2, 1 doublewords. */ | |
271 | ||
272 | /* We have a tail of 1~127 bytes. Copy up to 15 doublewords for | |
273 | speed. We'll handle the resulting tail bytes later. */ | |
274 | beq cr6,L(tail) /* No full doubleword: go straight to the byte tail. */ | |
275 | ||
276 | 8: bf 28,4f | |
277 | ||
278 | stfd 4,0(10) | |
279 | stfd 4,8(10) | |
280 | stfd 4,16(10) | |
281 | stfd 4,24(10) | |
282 | stfd 4,32(10) | |
283 | stfd 4,40(10) | |
284 | stfd 4,48(10) | |
285 | stfd 4,56(10) | |
286 | addi 10,10,64 | |
287 | .align 4 | |
288 | 4: bf 29,2f | |
289 | ||
290 | stfd 4,0(10) | |
291 | stfd 4,8(10) | |
292 | stfd 4,16(10) | |
293 | stfd 4,24(10) | |
294 | addi 10,10,32 | |
295 | .align 4 | |
296 | 2: bf 30,1f | |
297 | ||
298 | stfd 4,0(10) | |
299 | stfd 4,8(10) | |
300 | addi 10,10,16 | |
301 | .align 4 | |
302 | 1: bf 31,L(tail) | |
303 | ||
304 | stfd 4,0(10) | |
305 | addi 10,10,8 | |
306 | ||
307 | /* Handle the rest of the tail bytes here. */ | |
308 | L(tail): /* Entry: r10 = next byte to set, r5 = bytes still to write. */ | |
309 | mtocrf 0x01,5 /* cr7 = low 4 bits of r5; CR bits 29, 30, 31 select 4, 2, 1 bytes. */ | |
310 | ||
311 | .align 4 | |
312 | 4: bf 29,2f /* Skip the word store if the 4s bit is clear. */ | |
313 | ||
314 | stw 4,0(10) | |
315 | addi 10,10,4 | |
316 | .align 4 | |
317 | 2: bf 30,1f /* Skip the halfword store if the 2s bit is clear. */ | |
318 | ||
319 | sth 4,0(10) | |
320 | addi 10,10,2 | |
321 | .align 4 | |
322 | 1: bflr 31 /* Return if the 1s bit is clear. */ | |
323 | ||
324 | stb 4,0(10) | |
325 | blr | |
326 | ||
327 | ||
328 | /* Expanded tree to copy tail bytes without increments. | |
329 | Entry (from L(small)): r10 = DST, cr7 = low 4 bits of the length. */ .align 4 | |
330 | L(copy_tail): | |
331 | bf 29,L(FXX) /* 4s bit clear: at most 3 bytes to store. */ | |
332 | ||
333 | stw 4,0(10) | |
334 | bf 30,L(TFX) /* 2s bit clear: at most 1 byte after the word. */ | |
335 | ||
336 | sth 4,4(10) | |
337 | bflr 31 | |
338 | ||
339 | stb 4,6(10) | |
340 | blr | |
341 | ||
342 | .align 4 | |
343 | L(FXX): bf 30,L(FFX) /* No word stored; test the 2s bit. */ | |
344 | ||
345 | sth 4,0(10) | |
346 | bflr 31 | |
347 | ||
348 | stb 4,2(10) | |
349 | blr | |
350 | ||
351 | .align 4 | |
352 | L(TFX): bflr 31 /* Word stored, no halfword; return unless the 1s bit is set. */ | |
353 | ||
354 | stb 4,4(10) | |
355 | blr | |
356 | ||
357 | .align 4 | |
358 | L(FFX): bflr 31 /* Neither word nor halfword; return unless the 1s bit is set. */ | |
359 | ||
360 | stb 4,0(10) | |
361 | blr | |
362 | ||
363 | /* Handle copies of 9~31 bytes. */ | |
364 | .align 4 | |
365 | L(medium): /* Entry: r10 = DST, r5 = length, r0 = -DST (from the neg at function entry). */ | |
366 | /* At least 9 bytes to go. */ | |
367 | andi. 11,10,3 /* cr0.eq: DST is already word-aligned. */ | |
368 | clrlwi 0,0,30 /* r0 = (-DST) & 3 = bytes needed to reach word alignment. */ | |
369 | beq L(medium_aligned) | |
370 | ||
371 | /* Force 4-bytes alignment for DST. */ | |
372 | mtocrf 0x01,0 /* cr7 = low 4 bits of r0; CR bits 30 and 31 select 2 and 1 bytes. */ | |
373 | subf 5,0,5 /* Take the alignment bytes off the remaining length. */ | |
374 | 1: /* Copy 1 byte. */ | |
375 | bf 31,2f | |
376 | ||
377 | stb 4,0(10) | |
378 | addi 10,10,1 | |
379 | 2: /* Copy 2 bytes. */ | |
380 | bf 30,L(medium_aligned) | |
381 | ||
382 | sth 4,0(10) | |
383 | addi 10,10,2 | |
384 | ||
385 | .align 4 | |
386 | L(medium_aligned): | |
387 | /* At least 6 bytes to go, and DST is word-aligned. */ | |
388 | cmplwi cr1,5,16 /* cr1.lt: fewer than 16 bytes remain. */ | |
389 | mtocrf 0x01,5 /* cr7 = low 4 bits of r5; CR bits 28-31 select 8, 4, 2, 1 bytes. */ | |
390 | blt cr1,8f | |
391 | ||
392 | /* Copy 16 bytes. */ | |
393 | stw 4,0(10) | |
394 | stw 4,4(10) | |
395 | stw 4,8(10) | |
396 | stw 4,12(10) | |
397 | addi 10,10,16 | |
398 | 8: /* Copy 8 bytes. */ | |
399 | bf 28,4f | |
400 | ||
401 | stw 4,0(10) | |
402 | stw 4,4(10) | |
403 | addi 10,10,8 | |
404 | 4: /* Copy 4 bytes. */ | |
405 | bf 29,2f | |
406 | ||
407 | stw 4,0(10) | |
408 | addi 10,10,4 | |
409 | 2: /* Copy 2 bytes. */ | |
410 | bf 30,1f | |
411 | ||
412 | sth 4,0(10) | |
413 | addi 10,10,2 | |
414 | 1: /* Copy 1 byte. */ | |
415 | bflr 31 /* Return if the 1s bit is clear; r1 was not changed on this path. */ | |
416 | ||
417 | stb 4,0(10) | |
418 | blr | |
419 | ||
420 | /* Handles copies of 0~8 bytes. */ | |
421 | .align 4 | |
422 | L(small): /* Entry: r10 = DST, r5 = length, cr6 = length compared with 8 at function entry. */ | |
423 | mtocrf 0x01,5 /* cr7 = low 4 bits of the length, for L(copy_tail). */ | |
424 | bne cr6,L(copy_tail) /* Length is not 8 (so 0-7): use the store tree. */ | |
425 | ||
426 | stw 4,0(10) /* Exactly 8 bytes: two word stores. */ | |
427 | stw 4,4(10) | |
428 | blr | |
429 | ||
b5510883 | 430 | END (memset) |
33b8d90a | 431 | libc_hidden_builtin_def (memset) |