]>
Commit | Line | Data |
---|---|---|
33b8d90a | 1 | /* Optimized memset implementation for PowerPC64/POWER7. |
2b778ceb | 2 | Copyright (C) 2010-2021 Free Software Foundation, Inc. |
33b8d90a LM |
3 | This file is part of the GNU C Library. |
4 | ||
5 | The GNU C Library is free software; you can redistribute it and/or | |
6 | modify it under the terms of the GNU Lesser General Public | |
7 | License as published by the Free Software Foundation; either | |
8 | version 2.1 of the License, or (at your option) any later version. | |
9 | ||
10 | The GNU C Library is distributed in the hope that it will be useful, | |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | Lesser General Public License for more details. | |
14 | ||
15 | You should have received a copy of the GNU Lesser General Public | |
59ba27a6 | 16 | License along with the GNU C Library; if not, see |
5a82c748 | 17 | <https://www.gnu.org/licenses/>. */ |
33b8d90a LM |
18 | |
19 | #include <sysdep.h> | |
33b8d90a | 20 | |
f17a4233 | 21 | /* void * [r3] memset (void *s [r3], int c [r4], size_t n [r5]); |
33b8d90a LM |
22 | Returns 's'. */ |
23 | ||
18e0054b WSM |
24 | #ifndef MEMSET |
25 | # define MEMSET memset | |
26 | #endif | |
33b8d90a | 27 | .machine power7 |
d5b41185 | 28 | ENTRY_TOCLESS (MEMSET, 5) |
33b8d90a LM |
29 | CALL_MCOUNT 3 |
30 | ||
31 | L(_memset): /* Also the branch target used by __bzero. */ | |
32 | cmpldi cr7,5,31 /* cr7: compare n with 31 (medium vs. big path). */ | |
33 | cmpldi cr6,5,8 /* cr6: compare n with 8 (small path). */ | |
34 | mr 10,3 /* r10 = working DST pointer; r3 is left untouched by the code below. */ | |
35 | ||
36 | /* Replicate byte to word. */ | |
3be87c77 AM |
37 | insrdi 4,4,8,48 |
38 | insrdi 4,4,16,32 | |
33b8d90a LM |
39 | ble cr6,L(small) /* If length <= 8, use short copy code. */ |
40 | ||
41 | neg 0,3 /* r0 = -DST; its low bits give the byte count up to the next aligned address. */ | |
42 | ble cr7,L(medium) /* If length < 32, use medium copy code. */ | |
43 | ||
44 | andi. 11,10,7 /* Check 8-byte alignment of DST. */ | |
45 | insrdi 4,4,32,0 /* Replicate word to double word. */ | |
46 | ||
47 | mr 12,5 /* Keep the original length in r12; it is read again after L(huge_loop). */ | |
48 | beq L(big_aligned) | |
49 | ||
50 | clrldi 0,0,61 /* r0 = (-DST) & 7: bytes needed to reach 8-byte alignment. */ | |
51 | mtocrf 0x01,0 /* Copy the low 4 bits of r0 into cr7 (CR bits 28-31). */ | |
52 | subf 5,0,5 /* n -= r0. */ | |
53 | ||
54 | /* Get DST aligned to 8 bytes: CR bits 31/30/29 select the 1/2/4-byte stores. */ | |
55 | 1: bf 31,2f | |
56 | ||
57 | stb 4,0(10) | |
58 | addi 10,10,1 | |
59 | 2: bf 30,4f | |
60 | ||
61 | sth 4,0(10) | |
62 | addi 10,10,2 | |
63 | 4: bf 29,L(big_aligned) | |
64 | ||
65 | stw 4,0(10) | |
66 | addi 10,10,4 | |
67 | ||
68 | .align 4 | |
69 | L(big_aligned): | |
70 | ||
71 | cmpldi cr5,5,255 /* cr5: compare remaining n with 255. */ | |
72 | li 0,32 /* NOTE(review): r0 is overwritten before any later read on both paths - confirm this is dead. */ | |
73 | dcbtst 0,10 /* Touch-for-store hint on the block at r10. */ | |
74 | cmpldi cr6,4,0 /* cr6: is the replicated fill pattern zero? */ | |
75 | srdi 9,5,3 /* Number of full doublewords remaining. */ | |
76 | crand 27,26,21 /* CR bit 27 = cr6.eq AND cr5.gt: pattern == 0 and n > 255. */ | |
77 | mtocrf 0x01,9 /* cr7 = low 4 bits of the doubleword count. */ | |
78 | bt 27,L(huge) | |
79 | ||
80 | /* From this point on the dcbz path is not used: either the | |
81 | value isn't 0 or at most 255 bytes remain. */ | |
82 | ||
83 | srdi 8,5,5 /* r8 = number of 32-byte blocks. */ | |
84 | clrldi 11,5,61 /* r11 = n & 7: trailing bytes after the doublewords. */ | |
85 | cmpldi cr6,11,0 /* cr6.eq: no trailing bytes (tested in L(tail_bytes)). */ | |
86 | cmpldi cr1,9,4 /* cr1.lt: fewer than 4 doublewords, so r8 is 0. */ | |
87 | mtctr 8 /* CTR = 32-byte block count for L(big_loop). */ | |
88 | ||
89 | /* Copy 1~3 doublewords so the main loop starts | |
90 | at a multiple of 32 bytes. */ | |
91 | ||
92 | bf 30,1f | |
93 | ||
94 | std 4,0(10) | |
95 | std 4,8(10) | |
96 | addi 10,10,16 | |
97 | bf 31,L(big_loop) | |
98 | ||
99 | std 4,0(10) | |
100 | addi 10,10,8 | |
101 | mr 12,10 /* L(tail_bytes) stores through r12. */ | |
102 | blt cr1,L(tail_bytes) /* No 32-byte block remains (CTR is 0): skip the loop. */ | |
103 | b L(big_loop) | |
104 | ||
105 | .align 4 | |
106 | 1: /* Copy 1 doubleword. */ | |
107 | bf 31,L(big_loop) | |
108 | ||
109 | std 4,0(10) | |
110 | addi 10,10,8 | |
111 | ||
112 | /* Main aligned copy loop. Copies 32-bytes at a time and | |
113 | ping-pong through r10 and r12 to avoid AGEN delays. */ | |
114 | .align 4 | |
115 | L(big_loop): | |
116 | addi 12,10,32 /* r12 = address of the next 32-byte block. */ | |
117 | std 4,0(10) | |
118 | std 4,8(10) | |
119 | std 4,16(10) | |
120 | std 4,24(10) | |
121 | bdz L(tail_bytes) /* CTR exhausted: r12 already points just past the last store. */ | |
122 | ||
123 | addi 10,10,64 /* r10 = address of the block after the one at r12. */ | |
124 | std 4,0(12) | |
125 | std 4,8(12) | |
126 | std 4,16(12) | |
127 | std 4,24(12) | |
128 | bdnz L(big_loop) | |
129 | ||
130 | mr 12,10 /* r10 points just past the last store; pass it on in r12. */ | |
131 | b L(tail_bytes) | |
132 | ||
133 | .align 4 | |
134 | L(tail_bytes): | |
135 | ||
136 | /* Check for tail bytes. */ | |
137 | beqlr cr6 /* cr6.eq was set from (n & 7) == 0 in L(big_aligned): nothing left, return. */ | |
138 | ||
139 | clrldi 0,5,61 /* r0 = n & 7. */ | |
140 | mtocrf 0x01,0 /* cr7 = low 4 bits of r0; CR bits 29/30/31 select the 4/2/1-byte stores. */ | |
141 | ||
142 | /* At this point we have a tail of 1-7 bytes and we know that the | |
143 | destination is doubleword-aligned. */ | |
144 | 4: /* Copy 4 bytes. */ | |
145 | bf 29,2f | |
146 | ||
147 | stw 4,0(12) | |
148 | addi 12,12,4 | |
149 | 2: /* Copy 2 bytes. */ | |
150 | bf 30,1f | |
151 | ||
152 | sth 4,0(12) | |
153 | addi 12,12,2 | |
154 | 1: /* Copy 1 byte. */ | |
155 | bflr 31 | |
156 | ||
157 | stb 4,0(12) | |
158 | blr | |
159 | ||
160 | /* Special case when value is 0 and we have a long length to deal | |
161 | with. Use dcbz to zero out 128-bytes at a time. Before using | |
162 | dcbz though, we need to get the destination 128-bytes aligned. */ | |
163 | .align 4 | |
164 | L(huge): | |
165 | andi. 11,10,127 /* cr0.eq: DST is already 128-byte aligned. */ | |
166 | neg 0,10 /* r0 = -DST. */ | |
167 | beq L(huge_aligned) | |
168 | ||
169 | clrldi 0,0,57 /* r0 = (-DST) & 127: bytes needed to reach 128-byte alignment. */ | |
170 | subf 5,0,5 /* n -= r0. */ | |
171 | srdi 0,0,3 /* Convert the byte count to a doubleword count. */ | |
172 | mtocrf 0x01,0 /* cr7 = low 4 bits of that count; bits 28/29/30/31 select 8/4/2/1 doublewords. */ | |
173 | ||
174 | /* Get DST aligned to 128 bytes. */ | |
175 | 8: bf 28,4f | |
176 | ||
177 | std 4,0(10) | |
178 | std 4,8(10) | |
179 | std 4,16(10) | |
180 | std 4,24(10) | |
181 | std 4,32(10) | |
182 | std 4,40(10) | |
183 | std 4,48(10) | |
184 | std 4,56(10) | |
185 | addi 10,10,64 | |
186 | .align 4 | |
187 | 4: bf 29,2f | |
188 | ||
189 | std 4,0(10) | |
190 | std 4,8(10) | |
191 | std 4,16(10) | |
192 | std 4,24(10) | |
193 | addi 10,10,32 | |
194 | .align 4 | |
195 | 2: bf 30,1f | |
196 | ||
197 | std 4,0(10) | |
198 | std 4,8(10) | |
199 | addi 10,10,16 | |
200 | .align 4 | |
201 | 1: bf 31,L(huge_aligned) | |
202 | ||
203 | std 4,0(10) | |
204 | addi 10,10,8 | |
205 | ||
206 | ||
207 | L(huge_aligned): | |
208 | srdi 8,5,7 /* r8 = number of 128-byte blocks. */ | |
209 | clrldi 11,5,57 /* r11 = n & 127: bytes left after the dcbz loop. */ | |
210 | cmpldi cr6,11,0 /* cr6.eq: nothing left after the loop. */ | |
211 | mtctr 8 | |
212 | ||
213 | .align 4 | |
214 | L(huge_loop): | |
215 | dcbz 0,10 /* Zero the block at r10 (the fill pattern is 0 on this path). */ | |
216 | addi 10,10,128 | |
217 | bdnz L(huge_loop) | |
218 | ||
219 | /* Check how many bytes are still left. */ | |
220 | beqlr cr6 | |
221 | ||
222 | subf 9,3,10 /* r9 = r10 - r3: bytes written so far. */ | |
223 | subf 5,9,12 /* r5 = r12 - r9: original length minus bytes written. */ | |
224 | srdi 8,5,3 /* r8 = remaining full doublewords. */ | |
225 | cmpldi cr6,8,0 | |
226 | mtocrf 0x01,8 /* cr7 = low 4 bits of the doubleword count. */ | |
227 | ||
228 | /* We have a tail of 1~127 bytes. Copy up to 15 doublewords for | |
229 | speed. We'll handle the resulting tail bytes later. */ | |
230 | beq cr6,L(tail) | |
231 | ||
232 | 8: bf 28,4f | |
233 | ||
234 | std 4,0(10) | |
235 | std 4,8(10) | |
236 | std 4,16(10) | |
237 | std 4,24(10) | |
238 | std 4,32(10) | |
239 | std 4,40(10) | |
240 | std 4,48(10) | |
241 | std 4,56(10) | |
242 | addi 10,10,64 | |
243 | .align 4 | |
244 | 4: bf 29,2f | |
245 | ||
246 | std 4,0(10) | |
247 | std 4,8(10) | |
248 | std 4,16(10) | |
249 | std 4,24(10) | |
250 | addi 10,10,32 | |
251 | .align 4 | |
252 | 2: bf 30,1f | |
253 | ||
254 | std 4,0(10) | |
255 | std 4,8(10) | |
256 | addi 10,10,16 | |
257 | .align 4 | |
258 | 1: bf 31,L(tail) | |
259 | ||
260 | std 4,0(10) | |
261 | addi 10,10,8 | |
262 | ||
263 | /* Handle the rest of the tail bytes here. */ | |
264 | L(tail): | |
265 | mtocrf 0x01,5 /* cr7 = low 4 bits of the remaining length; bits 29/30/31 select the 4/2/1-byte stores. */ | |
266 | ||
267 | .align 4 | |
268 | 4: bf 29,2f | |
269 | ||
270 | stw 4,0(10) | |
271 | addi 10,10,4 | |
272 | .align 4 | |
273 | 2: bf 30,1f | |
274 | ||
275 | sth 4,0(10) | |
276 | addi 10,10,2 | |
277 | .align 4 | |
278 | 1: bflr 31 | |
279 | ||
280 | stb 4,0(10) | |
281 | blr | |
282 | ||
283 | /* Expanded tree to copy tail bytes without increments. Entered from L(small) with cr7 = low bits of n and n != 8. */ | |
284 | .align 4 | |
285 | L(copy_tail): | |
286 | bf 29,L(FXX) /* Bit 29 clear: no 4-byte store. */ | |
287 | ||
288 | stw 4,0(10) | |
289 | bf 30,L(TFX) /* Bit 30 clear: no 2-byte store. */ | |
290 | ||
291 | sth 4,4(10) | |
292 | bflr 31 | |
293 | ||
294 | stb 4,6(10) | |
295 | blr | |
296 | ||
297 | .align 4 | |
298 | L(FXX): bf 30,L(FFX) /* Reached with bit 29 clear. */ | |
299 | ||
300 | sth 4,0(10) | |
301 | bflr 31 | |
302 | ||
303 | stb 4,2(10) | |
304 | blr | |
305 | ||
306 | .align 4 | |
307 | L(TFX): bflr 31 /* Reached with bit 29 set and bit 30 clear; 4 bytes already stored. */ | |
308 | ||
309 | stb 4,4(10) | |
310 | blr | |
311 | ||
312 | .align 4 | |
313 | L(FFX): bflr 31 /* Reached with bits 29 and 30 clear: at most 1 byte to store. */ | |
314 | ||
315 | stb 4,0(10) | |
316 | blr | |
317 | ||
318 | /* Handle copies of 9~31 bytes. */ | |
319 | .align 4 | |
320 | L(medium): | |
321 | /* At least 9 bytes to go. r4 holds the byte replicated to a word only, so this path uses stw/sth/stb. */ | |
322 | andi. 11,10,3 /* cr0.eq: DST is already 4-byte aligned. */ | |
323 | clrldi 0,0,62 /* r0 = (-DST) & 3: bytes needed to reach 4-byte alignment (r0 was set by the neg at entry). */ | |
324 | beq L(medium_aligned) | |
325 | ||
3be87c77 | 326 | /* Force 4-bytes alignment for DST. */ |
33b8d90a LM |
327 | mtocrf 0x01,0 |
328 | subf 5,0,5 /* n -= r0. */ | |
329 | 1: /* Copy 1 byte. */ | |
330 | bf 31,2f | |
331 | ||
332 | stb 4,0(10) | |
333 | addi 10,10,1 | |
334 | 2: /* Copy 2 bytes. */ | |
335 | bf 30,L(medium_aligned) | |
336 | ||
337 | sth 4,0(10) | |
338 | addi 10,10,2 | |
339 | ||
340 | .align 4 | |
341 | L(medium_aligned): | |
342 | /* At least 6 bytes to go, and DST is word-aligned. */ | |
343 | cmpldi cr1,5,16 | |
344 | mtocrf 0x01,5 /* cr7 = low 4 bits of n; bits 28/29/30/31 select the 8/4/2/1-byte steps. */ | |
345 | blt cr1,8f | |
346 | ||
347 | /* Copy 16 bytes. */ | |
348 | stw 4,0(10) | |
349 | stw 4,4(10) | |
350 | stw 4,8(10) | |
351 | stw 4,12(10) | |
352 | addi 10,10,16 | |
353 | 8: /* Copy 8 bytes. */ | |
354 | bf 28,4f | |
355 | ||
356 | stw 4,0(10) | |
357 | stw 4,4(10) | |
358 | addi 10,10,8 | |
359 | 4: /* Copy 4 bytes. */ | |
360 | bf 29,2f | |
361 | ||
362 | stw 4,0(10) | |
363 | addi 10,10,4 | |
364 | 2: /* Copy 2 bytes. */ | |
365 | bf 30,1f | |
366 | ||
367 | sth 4,0(10) | |
368 | addi 10,10,2 | |
369 | 1: /* Copy 1 byte. */ | |
370 | bflr 31 | |
371 | ||
372 | stb 4,0(10) | |
373 | blr | |
374 | ||
375 | /* Handles copies of 0~8 bytes. */ | |
376 | .align 4 | |
377 | L(small): | |
378 | mtocrf 0x01,5 /* cr7 = low 4 bits of n, consumed by L(copy_tail). */ | |
379 | bne cr6,L(copy_tail) /* cr6 holds the n-vs-8 compare from entry: n != 8 goes to the store tree. */ | |
380 | ||
381 | stw 4,0(10) /* n == 8: two word stores cover it. */ | |
382 | stw 4,4(10) | |
383 | blr | |
384 | ||
18e0054b | 385 | END_GEN_TB (MEMSET,TB_TOCLESS) |
33b8d90a LM |
386 | libc_hidden_builtin_def (memset) |
387 | ||
388 | /* Copied from bzero.S to prevent the linker from inserting a stub | |
389 | between bzero and memset. */ | |
d5b41185 | 390 | ENTRY_TOCLESS (__bzero) |
33b8d90a LM |
391 | CALL_MCOUNT 3 |
392 | mr r5,r4 /* Move the incoming r4 into r5, memset's length register. */ | |
393 | li r4,0 /* Fill value = 0. */ | |
394 | b L(_memset) /* r3 is passed through unchanged as memset's DST. */ | |
3b473fec AZ |
395 | END (__bzero) |
396 | #ifndef __bzero | |
2d67d91a | 397 | weak_alias (__bzero, bzero) |
8a29a3d0 | 398 | #endif |