]>
Commit | Line | Data |
---|---|---|
33b8d90a | 1 | /* Optimized memset implementation for PowerPC64/POWER7. |
688903eb | 2 | Copyright (C) 2010-2018 Free Software Foundation, Inc. |
33b8d90a LM |
3 | Contributed by Luis Machado <luisgpm@br.ibm.com>. |
4 | This file is part of the GNU C Library. | |
5 | ||
6 | The GNU C Library is free software; you can redistribute it and/or | |
7 | modify it under the terms of the GNU Lesser General Public | |
8 | License as published by the Free Software Foundation; either | |
9 | version 2.1 of the License, or (at your option) any later version. | |
10 | ||
11 | The GNU C Library is distributed in the hope that it will be useful, | |
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
14 | Lesser General Public License for more details. | |
15 | ||
16 | You should have received a copy of the GNU Lesser General Public | |
59ba27a6 PE |
17 | License along with the GNU C Library; if not, see |
18 | <http://www.gnu.org/licenses/>. */ | |
33b8d90a LM |
19 | |
20 | #include <sysdep.h> | |
33b8d90a | 21 | |
f17a4233 | 22 | /* void * [r3] memset (void *s [r3], int c [r4], size_t n [r5]); |
33b8d90a LM |
23 | Returns 's'. */ |
24 | ||
18e0054b WSM |
25 | #ifndef MEMSET |
26 | # define MEMSET memset | |
27 | #endif | |
33b8d90a | 28 | .machine power7 |
d5b41185 | 29 | ENTRY_TOCLESS (MEMSET, 5) |
33b8d90a LM |
30 | CALL_MCOUNT 3 |
31 | ||
32 | L(_memset): | |
33 | cmpldi cr7,5,31 /* cr7: unsigned compare of the length (r5) with 31. */ | |
34 | cmpldi cr6,5,8 /* cr6: unsigned compare of the length (r5) with 8. */ | |
35 | mr 10,3 /* r10 is the running store pointer; no visible instruction writes r3. */ | |
36 | ||
37 | /* Replicate byte to word. */ | |
3be87c77 AM |
38 | insrdi 4,4,8,48 |
39 | insrdi 4,4,16,32 | |
33b8d90a LM |
40 | ble cr6,L(small) /* If length <= 8, use short copy code. */ |
41 | ||
42 | neg 0,3 /* r0 = -DST; its low bits give the distance to the next aligned address. */ | |
43 | ble cr7,L(medium) /* If length < 32, use medium copy code. */ | |
44 | ||
45 | andi. 11,10,7 /* Check alignment of DST (sets cr0 for the beq below). */ | |
46 | insrdi 4,4,32,0 /* Replicate word to double word. */ | |
47 | ||
48 | mr 12,5 /* r12 = length before the alignment adjustment below. */ | |
49 | beq L(big_aligned) | |
50 | ||
51 | clrldi 0,0,61 /* r0 = (-DST) & 7, the byte count needed to reach 8-byte alignment. */ | |
52 | mtocrf 0x01,0 /* Put the low four bits of r0 into cr7 (CR bits 28-31). */ | |
53 | subf 5,0,5 /* Deduct the alignment bytes from the remaining length. */ | |
54 | ||
55 | /* Get DST aligned to 8 bytes. CR bits 31, 30 and 29 select a byte, a halfword and a word. */ | |
56 | 1: bf 31,2f /* Bit 31 clear: no single byte needed. */ | |
57 | ||
58 | stb 4,0(10) | |
59 | addi 10,10,1 | |
60 | 2: bf 30,4f /* Bit 30 clear: no halfword needed. */ | |
61 | ||
62 | sth 4,0(10) | |
63 | addi 10,10,2 | |
64 | 4: bf 29,L(big_aligned) /* Bit 29 clear: no word needed. */ | |
65 | ||
66 | stw 4,0(10) | |
67 | addi 10,10,4 /* Execution falls through to L(big_aligned). */ | |
68 | ||
69 | .align 4 | |
70 | L(big_aligned): | |
71 | ||
72 | cmpldi cr5,5,255 /* cr5: remaining length compared with 255. */ | |
73 | li 0,32 /* NOTE(review): r0 is rewritten before any visible read of this value. */ | |
74 | dcbtst 0,10 /* Cache touch-for-store hint at the current DST. */ | |
75 | cmpldi cr6,4,0 /* cr6: is the replicated fill value zero? */ | |
76 | srdi 9,5,3 /* Number of full doublewords remaining. */ | |
77 | crand 27,26,21 /* CR bit 27 = (value == 0) AND (length > 255). */ | |
78 | mtocrf 0x01,9 /* Low four bits of the doubleword count go to cr7. */ | |
79 | bt 27,L(huge) | |
80 | ||
81 | /* From this point on the dcbz path was not taken: the value is | |
82 | non-zero or at most 255 bytes remain. */ | |
83 | ||
84 | srdi 8,5,5 /* r8 = number of 32-byte blocks. */ | |
85 | clrldi 11,5,61 /* r11 = length & 7 (tail bytes). */ | |
86 | cmpldi cr6,11,0 /* cr6 EQ when there are no tail bytes. */ | |
87 | cmpldi cr1,9,4 /* cr1: doubleword count compared with 4. */ | |
88 | mtctr 8 /* CTR counts 32-byte blocks for L(big_loop). */ | |
89 | ||
90 | /* Store 1~3 doublewords first so the doubleword count left for | |
91 | the main loop is a multiple of 4 (32 bytes). */ | |
92 | ||
93 | bf 30,1f /* Bit 30 clear: no pair of doublewords to store. */ | |
94 | ||
95 | std 4,0(10) | |
96 | std 4,8(10) | |
97 | addi 10,10,16 | |
98 | bf 31,L(big_loop) /* Bit 31 clear: no odd doubleword to store. */ | |
99 | ||
100 | std 4,0(10) | |
101 | addi 10,10,8 | |
102 | mr 12,10 /* L(tail_bytes) stores through r12. */ | |
103 | blt cr1,L(tail_bytes) /* Fewer than 4 doublewords in total: skip the 32-byte loop. */ | |
104 | b L(big_loop) | |
105 | ||
106 | .align 4 | |
107 | 1: /* Copy 1 doubleword. */ | |
108 | bf 31,L(big_loop) | |
109 | ||
110 | std 4,0(10) | |
111 | addi 10,10,8 | |
112 | ||
113 | /* Main aligned copy loop. Copies 32-bytes at a time and | |
114 | ping-pong through r10 and r12 to avoid AGEN delays. */ | |
115 | .align 4 | |
116 | L(big_loop): | |
117 | addi 12,10,32 /* r12 addresses the next 32-byte block. */ | |
118 | std 4,0(10) | |
119 | std 4,8(10) | |
120 | std 4,16(10) | |
121 | std 4,24(10) | |
122 | bdz L(tail_bytes) /* CTR exhausted: r12 already points past the last store. */ | |
123 | ||
124 | addi 10,10,64 /* r10 skips over the block written through r12 below. */ | |
125 | std 4,0(12) | |
126 | std 4,8(12) | |
127 | std 4,16(12) | |
128 | std 4,24(12) | |
129 | bdnz L(big_loop) | |
130 | ||
131 | mr 12,10 /* L(tail_bytes) stores through r12. */ | |
132 | b L(tail_bytes) | |
133 | ||
134 | .align 4 | |
135 | L(tail_bytes): | |
136 | ||
137 | /* Check for tail bytes. cr6 comes from the compare of (length & 7) with 0 made before the main loop. */ | |
138 | beqlr cr6 | |
139 | ||
140 | clrldi 0,5,61 /* r0 = length & 7. */ | |
141 | mtocrf 0x01,0 /* CR bits 29, 30 and 31 now select a word, a halfword and a byte. */ | |
142 | ||
143 | /* At this point we have a tail of 1-7 bytes and we know that the | |
144 | destination is doubleword-aligned. */ | |
145 | 4: /* Copy 4 bytes. */ | |
146 | bf 29,2f | |
147 | ||
148 | stw 4,0(12) | |
149 | addi 12,12,4 | |
150 | 2: /* Copy 2 bytes. */ | |
151 | bf 30,1f | |
152 | ||
153 | sth 4,0(12) | |
154 | addi 12,12,2 | |
155 | 1: /* Copy 1 byte. */ | |
156 | bflr 31 /* Return when no final byte is needed. */ | |
157 | ||
158 | stb 4,0(12) | |
159 | blr | |
160 | ||
161 | /* Special case when value is 0 and we have a long length to deal | |
162 | with. Use dcbz to zero out 128-bytes at a time. Before using | |
163 | dcbz though, we need to get the destination 128-bytes aligned. */ | |
164 | .align 4 | |
165 | L(huge): | |
166 | andi. 11,10,127 /* cr0 EQ when DST is already 128-byte aligned. */ | |
167 | neg 0,10 | |
168 | beq L(huge_aligned) | |
169 | ||
170 | clrldi 0,0,57 /* r0 = (-DST) & 127, bytes up to the next 128-byte boundary. */ | |
171 | subf 5,0,5 /* Deduct them from the remaining length. */ | |
172 | srdi 0,0,3 /* Convert the byte count to doublewords. */ | |
173 | mtocrf 0x01,0 /* CR bits 28-31 select 8, 4, 2 and 1 doublewords. */ | |
174 | ||
175 | /* Get DST aligned to 128 bytes. */ | |
176 | 8: bf 28,4f /* Bit 28 clear: skip the 64-byte step. */ | |
177 | ||
178 | std 4,0(10) | |
179 | std 4,8(10) | |
180 | std 4,16(10) | |
181 | std 4,24(10) | |
182 | std 4,32(10) | |
183 | std 4,40(10) | |
184 | std 4,48(10) | |
185 | std 4,56(10) | |
186 | addi 10,10,64 | |
187 | .align 4 | |
188 | 4: bf 29,2f /* Bit 29 clear: skip the 32-byte step. */ | |
189 | ||
190 | std 4,0(10) | |
191 | std 4,8(10) | |
192 | std 4,16(10) | |
193 | std 4,24(10) | |
194 | addi 10,10,32 | |
195 | .align 4 | |
196 | 2: bf 30,1f /* Bit 30 clear: skip the 16-byte step. */ | |
197 | ||
198 | std 4,0(10) | |
199 | std 4,8(10) | |
200 | addi 10,10,16 | |
201 | .align 4 | |
202 | 1: bf 31,L(huge_aligned) /* Bit 31 clear: skip the 8-byte step. */ | |
203 | ||
204 | std 4,0(10) | |
205 | addi 10,10,8 /* Execution falls through to L(huge_aligned). */ | |
206 | ||
207 | ||
208 | L(huge_aligned): /* Zero whole 128-byte blocks with dcbz, then store the leftover doublewords. */ | |
209 | srdi 8,5,7 /* r8 = number of 128-byte blocks. */ | |
210 | clrldi 11,5,57 /* r11 = length & 127 (bytes left after the blocks). */ | |
211 | cmpldi cr6,11,0 /* cr6 EQ when nothing is left after the blocks. */ | |
212 | mtctr 8 /* CTR counts the dcbz iterations. */ | |
213 | ||
214 | .align 4 | |
215 | L(huge_loop): | |
216 | dcbz 0,10 /* Zero the cache block at r10. */ | |
217 | addi 10,10,128 | |
218 | bdnz L(huge_loop) | |
219 | ||
220 | /* Check how many bytes are still left. */ | |
221 | beqlr cr6 | |
222 | ||
223 | subf 9,3,10 /* r9 = r10 - r3, bytes written so far. */ | |
224 | subf 5,9,12 /* r5 = r12 - r9; r12 is expected to hold the original length here. */ | |
225 | srdi 8,5,3 /* r8 = remaining doublewords. */ | |
226 | cmpldi cr6,8,0 /* cr6 EQ when no full doubleword is left. */ | |
227 | mtocrf 0x01,8 /* CR bits 28-31 select 8, 4, 2 and 1 doublewords. */ | |
228 | ||
229 | /* We have a tail of 1~127 bytes. Copy up to 15 doublewords for | |
230 | speed. We'll handle the resulting tail bytes later. */ | |
231 | beq cr6,L(tail) | |
232 | ||
233 | 8: bf 28,4f /* Bit 28 clear: skip the 64-byte step. */ | |
234 | ||
235 | std 4,0(10) | |
236 | std 4,8(10) | |
237 | std 4,16(10) | |
238 | std 4,24(10) | |
239 | std 4,32(10) | |
240 | std 4,40(10) | |
241 | std 4,48(10) | |
242 | std 4,56(10) | |
243 | addi 10,10,64 | |
244 | .align 4 | |
245 | 4: bf 29,2f /* Bit 29 clear: skip the 32-byte step. */ | |
246 | ||
247 | std 4,0(10) | |
248 | std 4,8(10) | |
249 | std 4,16(10) | |
250 | std 4,24(10) | |
251 | addi 10,10,32 | |
252 | .align 4 | |
253 | 2: bf 30,1f /* Bit 30 clear: skip the 16-byte step. */ | |
254 | ||
255 | std 4,0(10) | |
256 | std 4,8(10) | |
257 | addi 10,10,16 | |
258 | .align 4 | |
259 | 1: bf 31,L(tail) /* Bit 31 clear: skip the 8-byte step. */ | |
260 | ||
261 | std 4,0(10) | |
262 | addi 10,10,8 /* Execution falls through to L(tail). */ | |
263 | ||
264 | /* Handle the rest of the tail bytes here. CR bits 29, 30 and 31 select a word, a halfword and a byte. */ | |
265 | L(tail): | |
266 | mtocrf 0x01,5 /* Low four bits of the remaining length go to cr7. */ | |
267 | ||
268 | .align 4 | |
269 | 4: bf 29,2f /* Bit 29 clear: no word to store. */ | |
270 | ||
271 | stw 4,0(10) | |
272 | addi 10,10,4 | |
273 | .align 4 | |
274 | 2: bf 30,1f /* Bit 30 clear: no halfword to store. */ | |
275 | ||
276 | sth 4,0(10) | |
277 | addi 10,10,2 | |
278 | .align 4 | |
279 | 1: bflr 31 /* Return when no final byte is needed. */ | |
280 | ||
281 | stb 4,0(10) | |
282 | blr | |
283 | ||
284 | /* Expanded tree to copy tail bytes without increments. Reached from L(small) with the low four bits of the length in cr7; the letters in the FXX, TFX and FFX labels give the state (T set, F clear, X untested) of CR bits 29, 30 and 31. */ | |
285 | .align 4 | |
286 | L(copy_tail): | |
287 | bf 29,L(FXX) /* Bit 29 clear: no word to store. */ | |
288 | ||
289 | stw 4,0(10) | |
290 | bf 30,L(TFX) /* Word stored, bit 30 clear: no halfword. */ | |
291 | ||
292 | sth 4,4(10) /* Halfword goes right after the word. */ | |
293 | bflr 31 | |
294 | ||
295 | stb 4,6(10) /* Byte goes after the word and the halfword. */ | |
296 | blr | |
297 | ||
298 | .align 4 | |
299 | L(FXX): bf 30,L(FFX) /* No word stored; bit 30 clear: no halfword either. */ | |
300 | ||
301 | sth 4,0(10) | |
302 | bflr 31 | |
303 | ||
304 | stb 4,2(10) /* Byte goes right after the halfword. */ | |
305 | blr | |
306 | ||
307 | .align 4 | |
308 | L(TFX): bflr 31 /* Word stored, no halfword: return unless a byte is needed. */ | |
309 | ||
310 | stb 4,4(10) /* Byte goes right after the word. */ | |
311 | blr | |
312 | ||
313 | .align 4 | |
314 | L(FFX): bflr 31 /* Neither word nor halfword: return unless a byte is needed. */ | |
315 | ||
316 | stb 4,0(10) | |
317 | blr | |
318 | ||
319 | /* Handle copies of 9~31 bytes. The branch to here is taken before the word-to-doubleword replication, so r4 holds the fill byte replicated to a word and only stw, sth and stb are used. */ | |
320 | .align 4 | |
321 | L(medium): | |
322 | /* At least 9 bytes to go. */ | |
323 | andi. 11,10,3 /* cr0 EQ when DST is already word-aligned. */ | |
324 | clrldi 0,0,62 /* r0 = (-DST) & 3, bytes needed to reach word alignment. */ | |
325 | beq L(medium_aligned) | |
326 | ||
3be87c77 | 327 | /* Force 4-bytes alignment for DST. */ |
33b8d90a LM |
328 | mtocrf 0x01,0 |
329 | subf 5,0,5 /* Deduct the alignment bytes from the remaining length. */ | |
330 | 1: /* Copy 1 byte. */ | |
331 | bf 31,2f | |
332 | ||
333 | stb 4,0(10) | |
334 | addi 10,10,1 | |
335 | 2: /* Copy 2 bytes. */ | |
336 | bf 30,L(medium_aligned) | |
337 | ||
338 | sth 4,0(10) | |
339 | addi 10,10,2 | |
340 | ||
341 | .align 4 | |
342 | L(medium_aligned): | |
343 | /* At least 6 bytes to go, and DST is word-aligned. */ | |
344 | cmpldi cr1,5,16 /* The 16 bit of the length is outside the four bits moved to cr7, so test it separately. */ | |
345 | mtocrf 0x01,5 /* CR bits 28-31 select 8, 4, 2 and 1 bytes. */ | |
346 | blt cr1,8f | |
347 | ||
348 | /* Copy 16 bytes. */ | |
349 | stw 4,0(10) | |
350 | stw 4,4(10) | |
351 | stw 4,8(10) | |
352 | stw 4,12(10) | |
353 | addi 10,10,16 | |
354 | 8: /* Copy 8 bytes. */ | |
355 | bf 28,4f | |
356 | ||
357 | stw 4,0(10) | |
358 | stw 4,4(10) | |
359 | addi 10,10,8 | |
360 | 4: /* Copy 4 bytes. */ | |
361 | bf 29,2f | |
362 | ||
363 | stw 4,0(10) | |
364 | addi 10,10,4 | |
365 | 2: /* Copy 2 bytes. */ | |
366 | bf 30,1f | |
367 | ||
368 | sth 4,0(10) | |
369 | addi 10,10,2 | |
370 | 1: /* Copy 1 byte. */ | |
371 | bflr 31 /* Return when no final byte is needed. */ | |
372 | ||
373 | stb 4,0(10) | |
374 | blr | |
375 | ||
376 | /* Handles copies of 0~8 bytes. cr6 still holds the compare of the length with 8 made at entry. */ | |
377 | .align 4 | |
378 | L(small): | |
379 | mtocrf 0x01,5 /* Low four bits of the length go to cr7 for L(copy_tail). */ | |
380 | bne cr6,L(copy_tail) /* Length is not 8: let the tree store 0~7 bytes. */ | |
381 | ||
382 | stw 4,0(10) /* Length is exactly 8: two word stores cover it. */ | |
383 | stw 4,4(10) | |
384 | blr | |
385 | ||
18e0054b | 386 | END_GEN_TB (MEMSET,TB_TOCLESS) |
33b8d90a LM |
387 | libc_hidden_builtin_def (memset) |
388 | ||
389 | /* Copied from bzero.S to prevent the linker from inserting a stub | |
390 | between bzero and memset. Rearranges the arguments into the register layout used at L(_memset) and branches there. */ | |
d5b41185 | 391 | ENTRY_TOCLESS (__bzero) |
33b8d90a LM |
392 | CALL_MCOUNT 3 |
393 | mr r5,r4 /* The second argument (r4) becomes the length in r5. */ | |
394 | li r4,0 /* The fill value is zero. */ | |
395 | b L(_memset) /* Continue in the memset body; r3 is passed through unchanged. */ | |
3b473fec AZ |
396 | END (__bzero) |
397 | #ifndef __bzero | |
2d67d91a | 398 | weak_alias (__bzero, bzero) |
8a29a3d0 | 399 | #endif |