/* Optimized 32-bit memset implementation for POWER6.
   Copyright (C) 1997-2017 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

19 | #include <sysdep.h> | |
04067002 | 20 | |
f17a4233 | 21 | /* void * [r3] memset (void *s [r3], int c [r4], size_t n [r5])); |
04067002 UD |
22 | Returns 's'. |
23 | ||
24 | The memset is done in three sizes: byte (8 bits), word (32 bits), | |
25 | cache line (1024 bits). There is a special case for setting cache lines | |
26 | to 0, to take advantage of the dcbz instruction. */ | |
27 | ||
a88f47a7 | 28 | .machine power6 |
b5510883 | 29 | EALIGN (memset, 7, 0) |
04067002 UD |
30 | CALL_MCOUNT |
31 | ||
32 | #define rTMP r0 | |
33 | #define rRTN r3 /* Initial value of 1st argument. */ | |
34 | #define rMEMP0 r3 /* Original value of 1st arg. */ | |
35 | #define rCHR r4 /* Char to set in each byte. */ | |
36 | #define rLEN r5 /* Length of region to set. */ | |
37 | #define rMEMP r6 /* Address at which we are storing. */ | |
38 | #define rALIGN r7 /* Number of bytes we are setting now (when aligning). */ | |
39 | #define rMEMP2 r8 | |
40 | ||
41 | #define rNEG64 r8 /* Constant -64 for clearing with dcbz. */ | |
a88f47a7 | 42 | #define rMEMP3 r9 /* Alt mem pointer. */ |
04067002 UD |
43 | L(_memset): |
44 | /* Take care of case for size <= 4. */ | |
45 | cmplwi cr1, rLEN, 4 | |
46 | andi. rALIGN, rMEMP0, 3 | |
47 | mr rMEMP, rMEMP0 | |
48 | ble- cr1, L(small) | |
04067002 UD |
49 | /* Align to word boundary. */ |
50 | cmplwi cr5, rLEN, 31 | |
d298c416 | 51 | insrwi rCHR, rCHR, 8, 16 /* Replicate byte to halfword. */ |
04067002 UD |
52 | beq+ L(aligned) |
53 | mtcrf 0x01, rMEMP0 | |
54 | subfic rALIGN, rALIGN, 4 | |
55 | add rMEMP, rMEMP, rALIGN | |
56 | sub rLEN, rLEN, rALIGN | |
57 | bf+ 31, L(g0) | |
58 | stb rCHR, 0(rMEMP0) | |
59 | bt 30, L(aligned) | |
60 | L(g0): | |
61 | sth rCHR, -2(rMEMP) | |
62 | ||
63 | .align 4 | |
64 | /* Handle the case of size < 31. */ | |
65 | L(aligned): | |
66 | mtcrf 0x01, rLEN | |
d298c416 | 67 | insrwi rCHR, rCHR, 16, 0 /* Replicate halfword to word. */ |
04067002 UD |
68 | ble cr5, L(medium) |
69 | /* Align to 32-byte boundary. */ | |
70 | andi. rALIGN, rMEMP, 0x1C | |
71 | subfic rALIGN, rALIGN, 0x20 | |
72 | beq L(caligned) | |
73 | mtcrf 0x01, rALIGN | |
74 | add rMEMP, rMEMP, rALIGN | |
75 | sub rLEN, rLEN, rALIGN | |
76 | cmplwi cr1, rALIGN, 0x10 | |
77 | mr rMEMP2, rMEMP | |
78 | bf 28, L(a1) | |
79 | stw rCHR, -4(rMEMP2) | |
80 | stwu rCHR, -8(rMEMP2) | |
a88f47a7 | 81 | nop |
04067002 UD |
82 | L(a1): blt cr1, L(a2) |
83 | stw rCHR, -4(rMEMP2) | |
84 | stw rCHR, -8(rMEMP2) | |
85 | stw rCHR, -12(rMEMP2) | |
86 | stwu rCHR, -16(rMEMP2) | |
87 | L(a2): bf 29, L(caligned) | |
88 | stw rCHR, -4(rMEMP2) | |
89 | ||
a88f47a7 | 90 | .align 3 |
04067002 UD |
91 | /* Now aligned to a 32 byte boundary. */ |
92 | L(caligned): | |
93 | cmplwi cr1, rCHR, 0 | |
94 | clrrwi. rALIGN, rLEN, 5 | |
95 | mtcrf 0x01, rLEN | |
96 | beq cr1, L(zloopstart) /* Special case for clearing memory using dcbz. */ | |
97 | L(nondcbz): | |
04067002 | 98 | beq L(medium) /* We may not actually get to do a full line. */ |
a88f47a7 UD |
99 | nop |
100 | /* Storing a non-zero "c" value. We are aligned at a sector (32-byte) | |
101 | boundary may not be at cache line (128-byte) boundary. */ | |
102 | L(nzloopstart): | |
103 | /* memset in 32-byte chunks until we get to a cache line boundary. | |
f24a6d08 | 104 | If rLEN is less than the distance to the next cache-line boundary use |
a88f47a7 UD |
105 | cacheAligned1 code to finish the tail. */ |
106 | cmplwi cr1,rLEN,128 | |
107 | ||
108 | andi. rTMP,rMEMP,127 | |
109 | blt cr1,L(cacheAligned1) | |
110 | addi rMEMP3,rMEMP,32 | |
111 | beq L(nzCacheAligned) | |
112 | addi rLEN,rLEN,-32 | |
113 | stw rCHR,0(rMEMP) | |
114 | stw rCHR,4(rMEMP) | |
115 | stw rCHR,8(rMEMP) | |
116 | stw rCHR,12(rMEMP) | |
117 | stw rCHR,16(rMEMP) | |
118 | stw rCHR,20(rMEMP) | |
119 | addi rMEMP,rMEMP,32 | |
120 | andi. rTMP,rMEMP3,127 | |
121 | stw rCHR,-8(rMEMP3) | |
122 | stw rCHR,-4(rMEMP3) | |
04067002 | 123 | |
a88f47a7 UD |
124 | beq L(nzCacheAligned) |
125 | addi rLEN,rLEN,-32 | |
126 | stw rCHR,0(rMEMP3) | |
127 | stw rCHR,4(rMEMP3) | |
128 | addi rMEMP,rMEMP,32 | |
129 | stw rCHR,8(rMEMP3) | |
130 | stw rCHR,12(rMEMP3) | |
131 | andi. rTMP,rMEMP,127 | |
132 | stw rCHR,16(rMEMP3) | |
133 | stw rCHR,20(rMEMP3) | |
134 | stw rCHR,24(rMEMP3) | |
135 | stw rCHR,28(rMEMP3) | |
136 | ||
137 | beq L(nzCacheAligned) | |
138 | addi rLEN,rLEN,-32 | |
139 | /* At this point we can overrun the store queue (pipe reject) so it is | |
140 | time to slow things down. The store queue can merge two adjacent | |
141 | stores into a single L1/L2 op, but the L2 is clocked at 1/2 the CPU. | |
142 | So we add "group ending nops" to guarantee that we dispatch only two | |
143 | stores every other cycle. */ | |
144 | ori r1,r1,0 | |
145 | ori r1,r1,0 | |
146 | stw rCHR,32(rMEMP3) | |
147 | stw rCHR,36(rMEMP3) | |
148 | addi rMEMP,rMEMP,32 | |
149 | cmplwi cr1,rLEN,128 | |
150 | ori r1,r1,0 | |
151 | ori r1,r1,0 | |
152 | stw rCHR,40(rMEMP3) | |
153 | stw rCHR,44(rMEMP3) | |
154 | ori r1,r1,0 | |
155 | ori r1,r1,0 | |
156 | stw rCHR,48(rMEMP3) | |
157 | stw rCHR,52(rMEMP3) | |
158 | ori r1,r1,0 | |
159 | ori r1,r1,0 | |
160 | stw rCHR,56(rMEMP3) | |
161 | stw rCHR,60(rMEMP3) | |
162 | blt cr1,L(cacheAligned1) | |
163 | b L(nzCacheAligned) | |
164 | ||
165 | /* Now we are aligned to the cache line and can use dcbtst. */ | |
166 | .align 5 | |
167 | L(nzCacheAligned): | |
168 | cmplwi cr1,rLEN,128 | |
169 | cmplwi cr6,rLEN,256 | |
170 | blt cr1,L(cacheAligned1) | |
171 | blt cr6,L(nzCacheAligned128) | |
04067002 | 172 | .align 4 |
a88f47a7 UD |
173 | L(nzCacheAligned128): |
174 | nop | |
175 | addi rMEMP3,rMEMP,64 | |
176 | stw rCHR,0(rMEMP) | |
177 | stw rCHR,4(rMEMP) | |
178 | stw rCHR,8(rMEMP) | |
179 | stw rCHR,12(rMEMP) | |
180 | stw rCHR,16(rMEMP) | |
181 | stw rCHR,20(rMEMP) | |
182 | stw rCHR,24(rMEMP) | |
183 | stw rCHR,28(rMEMP) | |
184 | stw rCHR,32(rMEMP) | |
185 | stw rCHR,36(rMEMP) | |
186 | stw rCHR,40(rMEMP) | |
187 | stw rCHR,44(rMEMP) | |
188 | stw rCHR,48(rMEMP) | |
189 | stw rCHR,52(rMEMP) | |
190 | stw rCHR,56(rMEMP) | |
191 | stw rCHR,60(rMEMP) | |
192 | addi rMEMP,rMEMP3,64 | |
193 | addi rLEN,rLEN,-128 | |
194 | /* At this point we can overrun the store queue (pipe reject) so it is | |
195 | time to slow things down. The store queue can merge two adjacent | |
196 | stores into a single L1/L2 op, but the L2 is clocked at 1/2 the CPU. | |
197 | So we add "group ending nops" to guarantee that we dispatch only one | |
198 | store per cycle. */ | |
199 | stw rCHR,0(rMEMP3) | |
200 | ori r1,r1,0 | |
201 | stw rCHR,4(rMEMP3) | |
202 | ori r1,r1,0 | |
203 | stw rCHR,8(rMEMP3) | |
204 | ori r1,r1,0 | |
205 | stw rCHR,12(rMEMP3) | |
206 | ori r1,r1,0 | |
207 | stw rCHR,16(rMEMP3) | |
208 | ori r1,r1,0 | |
209 | stw rCHR,20(rMEMP3) | |
210 | ori r1,r1,0 | |
211 | stw rCHR,24(rMEMP3) | |
212 | ori r1,r1,0 | |
213 | stw rCHR,28(rMEMP3) | |
214 | ori r1,r1,0 | |
215 | stw rCHR,32(rMEMP3) | |
216 | ori r1,r1,0 | |
217 | stw rCHR,36(rMEMP3) | |
218 | ori r1,r1,0 | |
219 | stw rCHR,40(rMEMP3) | |
220 | ori r1,r1,0 | |
221 | stw rCHR,44(rMEMP3) | |
222 | ori r1,r1,0 | |
223 | stw rCHR,48(rMEMP3) | |
224 | ori r1,r1,0 | |
225 | stw rCHR,52(rMEMP3) | |
226 | ori r1,r1,0 | |
227 | stw rCHR,56(rMEMP3) | |
228 | ori r1,r1,0 | |
229 | stw rCHR,60(rMEMP3) | |
230 | blt cr6,L(cacheAligned1) | |
4f41c682 | 231 | #if IS_IN (libc) |
a88f47a7 UD |
232 | lfd 0,-128(rMEMP) |
233 | #endif | |
234 | b L(nzCacheAligned256) | |
235 | .align 5 | |
236 | L(nzCacheAligned256): | |
237 | cmplwi cr1,rLEN,256 | |
238 | addi rMEMP3,rMEMP,64 | |
4f41c682 | 239 | #if !IS_IN (libc) |
25bfbb9e | 240 | /* When we are not in libc we should use only GPRs to avoid the FPU lock |
a88f47a7 UD |
241 | interrupt. */ |
242 | stw rCHR,0(rMEMP) | |
243 | stw rCHR,4(rMEMP) | |
244 | stw rCHR,8(rMEMP) | |
245 | stw rCHR,12(rMEMP) | |
246 | stw rCHR,16(rMEMP) | |
247 | stw rCHR,20(rMEMP) | |
248 | stw rCHR,24(rMEMP) | |
249 | stw rCHR,28(rMEMP) | |
250 | stw rCHR,32(rMEMP) | |
251 | stw rCHR,36(rMEMP) | |
252 | stw rCHR,40(rMEMP) | |
253 | stw rCHR,44(rMEMP) | |
254 | stw rCHR,48(rMEMP) | |
255 | stw rCHR,52(rMEMP) | |
256 | stw rCHR,56(rMEMP) | |
257 | stw rCHR,60(rMEMP) | |
258 | addi rMEMP,rMEMP3,64 | |
259 | addi rLEN,rLEN,-128 | |
260 | stw rCHR,0(rMEMP3) | |
261 | stw rCHR,4(rMEMP3) | |
262 | stw rCHR,8(rMEMP3) | |
263 | stw rCHR,12(rMEMP3) | |
264 | stw rCHR,16(rMEMP3) | |
265 | stw rCHR,20(rMEMP3) | |
266 | stw rCHR,24(rMEMP3) | |
267 | stw rCHR,28(rMEMP3) | |
268 | stw rCHR,32(rMEMP3) | |
269 | stw rCHR,36(rMEMP3) | |
270 | stw rCHR,40(rMEMP3) | |
271 | stw rCHR,44(rMEMP3) | |
272 | stw rCHR,48(rMEMP3) | |
273 | stw rCHR,52(rMEMP3) | |
274 | stw rCHR,56(rMEMP3) | |
275 | stw rCHR,60(rMEMP3) | |
276 | #else | |
277 | /* We are in libc and this is a long memset so we can use FPRs and can afford | |
278 | occasional FPU locked interrupts. */ | |
279 | stfd 0,0(rMEMP) | |
280 | stfd 0,8(rMEMP) | |
281 | stfd 0,16(rMEMP) | |
282 | stfd 0,24(rMEMP) | |
283 | stfd 0,32(rMEMP) | |
284 | stfd 0,40(rMEMP) | |
285 | stfd 0,48(rMEMP) | |
286 | stfd 0,56(rMEMP) | |
287 | addi rMEMP,rMEMP3,64 | |
288 | addi rLEN,rLEN,-128 | |
289 | stfd 0,0(rMEMP3) | |
290 | stfd 0,8(rMEMP3) | |
291 | stfd 0,16(rMEMP3) | |
292 | stfd 0,24(rMEMP3) | |
293 | stfd 0,32(rMEMP3) | |
294 | stfd 0,40(rMEMP3) | |
295 | stfd 0,48(rMEMP3) | |
296 | stfd 0,56(rMEMP3) | |
297 | #endif | |
298 | bge cr1,L(nzCacheAligned256) | |
299 | dcbtst 0,rMEMP | |
300 | b L(cacheAligned1) | |
04067002 | 301 | |
a88f47a7 UD |
302 | .align 4 |
303 | /* Storing a zero "c" value. We are aligned at a sector (32-byte) | |
304 | boundary but may not be at cache line (128-byte) boundary. If the | |
305 | remaining length spans a full cache line we can use the Data cache | |
306 | block zero instruction. */ | |
04067002 | 307 | L(zloopstart): |
a88f47a7 | 308 | /* memset in 32-byte chunks until we get to a cache line boundary. |
f24a6d08 | 309 | If rLEN is less than the distance to the next cache-line boundary use |
a88f47a7 UD |
310 | cacheAligned1 code to finish the tail. */ |
311 | cmplwi cr1,rLEN,128 | |
04067002 | 312 | beq L(medium) |
04067002 | 313 | L(getCacheAligned): |
04067002 | 314 | andi. rTMP,rMEMP,127 |
a88f47a7 UD |
315 | blt cr1,L(cacheAligned1) |
316 | addi rMEMP3,rMEMP,32 | |
317 | beq L(cacheAligned) | |
318 | addi rLEN,rLEN,-32 | |
319 | stw rCHR,0(rMEMP) | |
320 | stw rCHR,4(rMEMP) | |
321 | stw rCHR,8(rMEMP) | |
322 | stw rCHR,12(rMEMP) | |
323 | stw rCHR,16(rMEMP) | |
324 | stw rCHR,20(rMEMP) | |
325 | addi rMEMP,rMEMP,32 | |
326 | andi. rTMP,rMEMP3,127 | |
327 | stw rCHR,-8(rMEMP3) | |
328 | stw rCHR,-4(rMEMP3) | |
329 | L(getCacheAligned2): | |
04067002 | 330 | beq L(cacheAligned) |
a88f47a7 | 331 | addi rLEN,rLEN,-32 |
04067002 | 332 | addi rMEMP,rMEMP,32 |
a88f47a7 UD |
333 | stw rCHR,0(rMEMP3) |
334 | stw rCHR,4(rMEMP3) | |
335 | stw rCHR,8(rMEMP3) | |
336 | stw rCHR,12(rMEMP3) | |
337 | andi. rTMP,rMEMP,127 | |
338 | nop | |
339 | stw rCHR,16(rMEMP3) | |
340 | stw rCHR,20(rMEMP3) | |
341 | stw rCHR,24(rMEMP3) | |
342 | stw rCHR,28(rMEMP3) | |
343 | L(getCacheAligned3): | |
344 | beq L(cacheAligned) | |
345 | /* At this point we can overrun the store queue (pipe reject) so it is | |
346 | time to slow things down. The store queue can merge two adjacent | |
347 | stores into a single L1/L2 op, but the L2 is clocked at 1/2 the CPU. | |
348 | So we add "group ending nops" to guarantee that we dispatch only two | |
349 | stores every other cycle. */ | |
04067002 | 350 | addi rLEN,rLEN,-32 |
a88f47a7 UD |
351 | ori r1,r1,0 |
352 | ori r1,r1,0 | |
353 | stw rCHR,32(rMEMP3) | |
354 | stw rCHR,36(rMEMP3) | |
355 | addi rMEMP,rMEMP,32 | |
356 | cmplwi cr1,rLEN,128 | |
357 | ori r1,r1,0 | |
358 | stw rCHR,40(rMEMP3) | |
359 | stw rCHR,44(rMEMP3) | |
360 | cmplwi cr6,rLEN,256 | |
361 | li rMEMP2,128 | |
362 | ori r1,r1,0 | |
363 | stw rCHR,48(rMEMP3) | |
364 | stw rCHR,52(rMEMP3) | |
365 | ori r1,r1,0 | |
366 | ori r1,r1,0 | |
367 | stw rCHR,56(rMEMP3) | |
368 | stw rCHR,60(rMEMP3) | |
369 | blt cr1,L(cacheAligned1) | |
370 | blt cr6,L(cacheAligned128) | |
371 | b L(cacheAlignedx) | |
04067002 UD |
372 | |
373 | /* Now we are aligned to the cache line and can use dcbz. */ | |
374 | .align 4 | |
375 | L(cacheAligned): | |
a88f47a7 UD |
376 | cmplwi cr1,rLEN,128 |
377 | cmplwi cr6,rLEN,256 | |
378 | blt cr1,L(cacheAligned1) | |
379 | li rMEMP2,128 | |
380 | L(cacheAlignedx): | |
25bfbb9e | 381 | cmplwi cr5,rLEN,640 |
a88f47a7 UD |
382 | blt cr6,L(cacheAligned128) |
383 | bgt cr5,L(cacheAligned512) | |
384 | cmplwi cr6,rLEN,512 | |
04067002 | 385 | dcbz 0,rMEMP |
a88f47a7 UD |
386 | cmplwi cr1,rLEN,384 |
387 | dcbz rMEMP2,rMEMP | |
388 | addi rMEMP,rMEMP,256 | |
389 | addi rLEN,rLEN,-256 | |
390 | blt cr1,L(cacheAligned1) | |
391 | blt cr6,L(cacheAligned128) | |
392 | b L(cacheAligned256) | |
393 | .align 5 | |
394 | /* A simple loop for the longer (>640 bytes) lengths. This form limits | |
395 | the branch miss-predicted to exactly 1 at loop exit.*/ | |
396 | L(cacheAligned512): | |
78b7adba | 397 | cmplwi cr1,rLEN,128 |
a88f47a7 UD |
398 | blt cr1,L(cacheAligned1) |
399 | dcbz 0,rMEMP | |
400 | addi rLEN,rLEN,-128 | |
401 | addi rMEMP,rMEMP,128 | |
402 | b L(cacheAligned512) | |
403 | .align 5 | |
404 | L(cacheAligned256): | |
405 | cmplwi cr6,rLEN,512 | |
406 | dcbz 0,rMEMP | |
407 | cmplwi cr1,rLEN,384 | |
408 | dcbz rMEMP2,rMEMP | |
409 | addi rMEMP,rMEMP,256 | |
410 | addi rLEN,rLEN,-256 | |
411 | bge cr6,L(cacheAligned256) | |
412 | blt cr1,L(cacheAligned1) | |
413 | .align 4 | |
414 | L(cacheAligned128): | |
415 | dcbz 0,rMEMP | |
416 | addi rMEMP,rMEMP,128 | |
417 | addi rLEN,rLEN,-128 | |
418 | .align 4 | |
419 | L(cacheAligned1): | |
420 | cmplwi cr1,rLEN,32 | |
421 | blt cr1,L(handletail32) | |
422 | addi rMEMP3,rMEMP,32 | |
423 | addi rLEN,rLEN,-32 | |
424 | stw rCHR,0(rMEMP) | |
425 | stw rCHR,4(rMEMP) | |
426 | stw rCHR,8(rMEMP) | |
427 | stw rCHR,12(rMEMP) | |
428 | stw rCHR,16(rMEMP) | |
429 | stw rCHR,20(rMEMP) | |
430 | addi rMEMP,rMEMP,32 | |
431 | cmplwi cr1,rLEN,32 | |
432 | stw rCHR,-8(rMEMP3) | |
433 | stw rCHR,-4(rMEMP3) | |
434 | L(cacheAligned2): | |
435 | blt cr1,L(handletail32) | |
436 | addi rLEN,rLEN,-32 | |
437 | stw rCHR,0(rMEMP3) | |
438 | stw rCHR,4(rMEMP3) | |
439 | stw rCHR,8(rMEMP3) | |
440 | stw rCHR,12(rMEMP3) | |
441 | addi rMEMP,rMEMP,32 | |
442 | cmplwi cr1,rLEN,32 | |
443 | stw rCHR,16(rMEMP3) | |
444 | stw rCHR,20(rMEMP3) | |
445 | stw rCHR,24(rMEMP3) | |
446 | stw rCHR,28(rMEMP3) | |
447 | nop | |
448 | L(cacheAligned3): | |
449 | blt cr1,L(handletail32) | |
450 | /* At this point we can overrun the store queue (pipe reject) so it is | |
451 | time to slow things down. The store queue can merge two adjacent | |
452 | stores into a single L1/L2 op, but the L2 is clocked at 1/2 the CPU. | |
453 | So we add "group ending nops" to guarantee that we dispatch only two | |
454 | stores every other cycle. */ | |
455 | ori r1,r1,0 | |
456 | ori r1,r1,0 | |
457 | addi rMEMP,rMEMP,32 | |
458 | addi rLEN,rLEN,-32 | |
459 | ori r1,r1,0 | |
460 | ori r1,r1,0 | |
461 | stw rCHR,32(rMEMP3) | |
462 | stw rCHR,36(rMEMP3) | |
463 | ori r1,r1,0 | |
464 | ori r1,r1,0 | |
465 | stw rCHR,40(rMEMP3) | |
466 | stw rCHR,44(rMEMP3) | |
467 | ori r1,r1,0 | |
468 | ori r1,r1,0 | |
469 | stw rCHR,48(rMEMP3) | |
470 | stw rCHR,52(rMEMP3) | |
471 | ori r1,r1,0 | |
472 | ori r1,r1,0 | |
473 | stw rCHR,56(rMEMP3) | |
474 | stw rCHR,60(rMEMP3) | |
04067002 | 475 | |
a88f47a7 UD |
476 | /* We are here because the length or remainder (rLEN) is less than the |
477 | cache line/sector size and does not justify aggressive loop unrolling. | |
478 | So set up the preconditions for L(medium) and go there. */ | |
04067002 UD |
479 | .align 3 |
480 | L(handletail32): | |
a88f47a7 UD |
481 | cmplwi cr1,rLEN,0 |
482 | beqlr cr1 | |
483 | b L(medium) | |
04067002 | 484 | |
a88f47a7 | 485 | .align 4 |
04067002 UD |
486 | L(small): |
487 | /* Memset of 4 bytes or less. */ | |
488 | cmplwi cr5, rLEN, 1 | |
489 | cmplwi cr1, rLEN, 3 | |
490 | bltlr cr5 | |
491 | stb rCHR, 0(rMEMP) | |
492 | beqlr cr5 | |
493 | stb rCHR, 1(rMEMP) | |
494 | bltlr cr1 | |
495 | stb rCHR, 2(rMEMP) | |
496 | beqlr cr1 | |
497 | stb rCHR, 3(rMEMP) | |
498 | blr | |
499 | ||
500 | /* Memset of 0-31 bytes. */ | |
501 | .align 5 | |
502 | L(medium): | |
503 | cmplwi cr1, rLEN, 16 | |
504 | L(medium_tail2): | |
505 | add rMEMP, rMEMP, rLEN | |
506 | L(medium_tail): | |
507 | bt- 31, L(medium_31t) | |
508 | bt- 30, L(medium_30t) | |
509 | L(medium_30f): | |
a88f47a7 | 510 | bt 29, L(medium_29t) |
04067002 | 511 | L(medium_29f): |
a88f47a7 UD |
512 | bge cr1, L(medium_27t) |
513 | bflr 28 | |
04067002 UD |
514 | stw rCHR, -4(rMEMP) |
515 | stw rCHR, -8(rMEMP) | |
516 | blr | |
517 | ||
518 | L(medium_31t): | |
519 | stbu rCHR, -1(rMEMP) | |
520 | bf- 30, L(medium_30f) | |
521 | L(medium_30t): | |
522 | sthu rCHR, -2(rMEMP) | |
523 | bf- 29, L(medium_29f) | |
524 | L(medium_29t): | |
525 | stwu rCHR, -4(rMEMP) | |
a88f47a7 | 526 | blt cr1, L(medium_27f) |
04067002 UD |
527 | L(medium_27t): |
528 | stw rCHR, -4(rMEMP) | |
529 | stw rCHR, -8(rMEMP) | |
530 | stw rCHR, -12(rMEMP) | |
531 | stwu rCHR, -16(rMEMP) | |
532 | L(medium_27f): | |
a88f47a7 | 533 | bflr 28 |
04067002 UD |
534 | L(medium_28t): |
535 | stw rCHR, -4(rMEMP) | |
536 | stw rCHR, -8(rMEMP) | |
537 | blr | |
b5510883 | 538 | END (memset) |
04067002 | 539 | libc_hidden_builtin_def (memset) |