/* Optimized 32-bit memset implementation for POWER6.
   Copyright (C) 1997-2019 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>. */

#include <sysdep.h>

/* void * [r3] memset (void *s [r3], int c [r4], size_t n [r5]);
   Returns 's'.

   The memset is done in three sizes: byte (8 bits), word (32 bits),
   cache line (1024 bits). There is a special case for setting cache lines
   to 0, to take advantage of the dcbz instruction. */
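
/* Entry dispatch (summarizing the code below): lengths of at most 4
   bytes are handled at L(small), lengths of at most 31 bytes at
   L(medium), and longer lengths fall into the 32-byte and cache-line
   loops, with the dcbz path reserved for a zero fill value. */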

        .machine power6
EALIGN (memset, 7, 0)
        CALL_MCOUNT
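
/* EALIGN places the entry point on a 2^7 (128-byte, i.e. cache-line)
   boundary; CALL_MCOUNT is the usual glibc profiling hook and expands
   to nothing in non-profiled builds. */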

#define rTMP    r0
#define rRTN    r3      /* Initial value of 1st argument. */
#define rMEMP0  r3      /* Original value of 1st arg. */
#define rCHR    r4      /* Char to set in each byte. */
#define rLEN    r5      /* Length of region to set. */
#define rMEMP   r6      /* Address at which we are storing. */
#define rALIGN  r7      /* Number of bytes we are setting now (when aligning). */
#define rMEMP2  r8

#define rNEG64  r8      /* Constant -64 for clearing with dcbz. */
#define rMEMP3  r9      /* Alt mem pointer. */
L(_memset):
/* Take care of case for size <= 4. */
        cmplwi cr1, rLEN, 4
        andi. rALIGN, rMEMP0, 3
        mr rMEMP, rMEMP0
        ble- cr1, L(small)
/* Align to word boundary. */
        cmplwi cr5, rLEN, 31
        insrwi rCHR, rCHR, 8, 16 /* Replicate byte to halfword. */
        beq+ L(aligned)
        mtcrf 0x01, rMEMP0
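/* CR7 now holds the low bits of the address: bit 31 set means the
   address is odd and a byte store is needed; bit 30 is tested after
   that byte store and, if set, the pointer is already word aligned so
   the sth below can be skipped, otherwise the sth completes the
   alignment. */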
        subfic rALIGN, rALIGN, 4
        add rMEMP, rMEMP, rALIGN
        sub rLEN, rLEN, rALIGN
        bf+ 31, L(g0)
        stb rCHR, 0(rMEMP0)
        bt 30, L(aligned)
L(g0):
        sth rCHR, -2(rMEMP)

        .align 4
/* Handle the case of size <= 31. */
L(aligned):
        mtcrf 0x01, rLEN
        insrwi rCHR, rCHR, 16, 0 /* Replicate halfword to word. */
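/* After the two insrwi instructions every byte of rCHR holds the fill
   character; e.g. c = 0x2a yields rCHR = 0x2a2a2a2a, so each stw below
   sets four bytes at once. */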
        ble cr5, L(medium)
/* Align to 32-byte boundary. */
        andi. rALIGN, rMEMP, 0x1C
        subfic rALIGN, rALIGN, 0x20
        beq L(caligned)
        mtcrf 0x01, rALIGN
        add rMEMP, rMEMP, rALIGN
        sub rLEN, rLEN, rALIGN
        cmplwi cr1, rALIGN, 0x10
        mr rMEMP2, rMEMP
        bf 28, L(a1)
        stw rCHR, -4(rMEMP2)
        stwu rCHR, -8(rMEMP2)
        nop
L(a1):  blt cr1, L(a2)
        stw rCHR, -4(rMEMP2)
        stw rCHR, -8(rMEMP2)
        stw rCHR, -12(rMEMP2)
        stwu rCHR, -16(rMEMP2)
L(a2):  bf 29, L(caligned)
        stw rCHR, -4(rMEMP2)

        .align 3
/* Now aligned to a 32 byte boundary. */
L(caligned):
        cmplwi cr1, rCHR, 0
        clrrwi. rALIGN, rLEN, 5
        mtcrf 0x01, rLEN
        beq cr1, L(zloopstart) /* Special case for clearing memory using dcbz. */
L(nondcbz):
        beq L(medium) /* We may not actually get to do a full line. */
        nop
/* Storing a non-zero "c" value. We are aligned at a sector (32-byte)
   boundary but may not be at cache line (128-byte) boundary. */
L(nzloopstart):
/* memset in 32-byte chunks until we get to a cache line boundary.
   If rLEN is less than the distance to the next cache-line boundary use
   cacheAligned1 code to finish the tail. */
        cmplwi cr1,rLEN,128

        andi. rTMP,rMEMP,127
        blt cr1,L(cacheAligned1)
        addi rMEMP3,rMEMP,32
        beq L(nzCacheAligned)
        addi rLEN,rLEN,-32
        stw rCHR,0(rMEMP)
        stw rCHR,4(rMEMP)
        stw rCHR,8(rMEMP)
        stw rCHR,12(rMEMP)
        stw rCHR,16(rMEMP)
        stw rCHR,20(rMEMP)
        addi rMEMP,rMEMP,32
        andi. rTMP,rMEMP3,127
        stw rCHR,-8(rMEMP3)
        stw rCHR,-4(rMEMP3)

        beq L(nzCacheAligned)
        addi rLEN,rLEN,-32
        stw rCHR,0(rMEMP3)
        stw rCHR,4(rMEMP3)
        addi rMEMP,rMEMP,32
        stw rCHR,8(rMEMP3)
        stw rCHR,12(rMEMP3)
        andi. rTMP,rMEMP,127
        stw rCHR,16(rMEMP3)
        stw rCHR,20(rMEMP3)
        stw rCHR,24(rMEMP3)
        stw rCHR,28(rMEMP3)

        beq L(nzCacheAligned)
        addi rLEN,rLEN,-32
/* At this point we can overrun the store queue (pipe reject) so it is
   time to slow things down. The store queue can merge two adjacent
   stores into a single L1/L2 op, but the L2 is clocked at 1/2 the CPU.
   So we add "group ending nops" to guarantee that we dispatch only two
   stores every other cycle. */
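/* The "ori r1,r1,0" instructions below have no architectural effect
   (OR of r1 with zero); they serve only as the group ending nops
   described above, forcing a dispatch group boundary so the stores are
   issued no faster than the store queue can drain. */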
        ori r1,r1,0
        ori r1,r1,0
        stw rCHR,32(rMEMP3)
        stw rCHR,36(rMEMP3)
        addi rMEMP,rMEMP,32
        cmplwi cr1,rLEN,128
        ori r1,r1,0
        ori r1,r1,0
        stw rCHR,40(rMEMP3)
        stw rCHR,44(rMEMP3)
        ori r1,r1,0
        ori r1,r1,0
        stw rCHR,48(rMEMP3)
        stw rCHR,52(rMEMP3)
        ori r1,r1,0
        ori r1,r1,0
        stw rCHR,56(rMEMP3)
        stw rCHR,60(rMEMP3)
        blt cr1,L(cacheAligned1)
        b L(nzCacheAligned)

/* Now we are aligned to the cache line and can use dcbtst. */
        .align 5
L(nzCacheAligned):
        cmplwi cr1,rLEN,128
        cmplwi cr6,rLEN,256
        blt cr1,L(cacheAligned1)
        blt cr6,L(nzCacheAligned128)
        .align 4
L(nzCacheAligned128):
        nop
        addi rMEMP3,rMEMP,64
        stw rCHR,0(rMEMP)
        stw rCHR,4(rMEMP)
        stw rCHR,8(rMEMP)
        stw rCHR,12(rMEMP)
        stw rCHR,16(rMEMP)
        stw rCHR,20(rMEMP)
        stw rCHR,24(rMEMP)
        stw rCHR,28(rMEMP)
        stw rCHR,32(rMEMP)
        stw rCHR,36(rMEMP)
        stw rCHR,40(rMEMP)
        stw rCHR,44(rMEMP)
        stw rCHR,48(rMEMP)
        stw rCHR,52(rMEMP)
        stw rCHR,56(rMEMP)
        stw rCHR,60(rMEMP)
        addi rMEMP,rMEMP3,64
        addi rLEN,rLEN,-128
/* At this point we can overrun the store queue (pipe reject) so it is
   time to slow things down. The store queue can merge two adjacent
   stores into a single L1/L2 op, but the L2 is clocked at 1/2 the CPU.
   So we add "group ending nops" to guarantee that we dispatch only one
   store per cycle. */
        stw rCHR,0(rMEMP3)
        ori r1,r1,0
        stw rCHR,4(rMEMP3)
        ori r1,r1,0
        stw rCHR,8(rMEMP3)
        ori r1,r1,0
        stw rCHR,12(rMEMP3)
        ori r1,r1,0
        stw rCHR,16(rMEMP3)
        ori r1,r1,0
        stw rCHR,20(rMEMP3)
        ori r1,r1,0
        stw rCHR,24(rMEMP3)
        ori r1,r1,0
        stw rCHR,28(rMEMP3)
        ori r1,r1,0
        stw rCHR,32(rMEMP3)
        ori r1,r1,0
        stw rCHR,36(rMEMP3)
        ori r1,r1,0
        stw rCHR,40(rMEMP3)
        ori r1,r1,0
        stw rCHR,44(rMEMP3)
        ori r1,r1,0
        stw rCHR,48(rMEMP3)
        ori r1,r1,0
        stw rCHR,52(rMEMP3)
        ori r1,r1,0
        stw rCHR,56(rMEMP3)
        ori r1,r1,0
        stw rCHR,60(rMEMP3)
        blt cr6,L(cacheAligned1)
#if IS_IN (libc)
        lfd 0,-128(rMEMP)
#endif
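/* The 128 bytes just stored already contain the replicated fill
   character, so in the libc build the lfd above simply reloads 8 bytes
   of that pattern into FPR 0 for the stfd stores used below. */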
        b L(nzCacheAligned256)
        .align 5
L(nzCacheAligned256):
        cmplwi cr1,rLEN,256
        addi rMEMP3,rMEMP,64
#if !IS_IN (libc)
/* When we are not in libc we should use only GPRs to avoid the FPU lock
   interrupt. */
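/* Outside libc (e.g. in the dynamic loader) the thread's FP state may
   not be live, and touching an FPR could force the kernel to enable
   and restore it; plain stw stores avoid that at the cost of twice as
   many store instructions per 128 bytes. */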
        stw rCHR,0(rMEMP)
        stw rCHR,4(rMEMP)
        stw rCHR,8(rMEMP)
        stw rCHR,12(rMEMP)
        stw rCHR,16(rMEMP)
        stw rCHR,20(rMEMP)
        stw rCHR,24(rMEMP)
        stw rCHR,28(rMEMP)
        stw rCHR,32(rMEMP)
        stw rCHR,36(rMEMP)
        stw rCHR,40(rMEMP)
        stw rCHR,44(rMEMP)
        stw rCHR,48(rMEMP)
        stw rCHR,52(rMEMP)
        stw rCHR,56(rMEMP)
        stw rCHR,60(rMEMP)
        addi rMEMP,rMEMP3,64
        addi rLEN,rLEN,-128
        stw rCHR,0(rMEMP3)
        stw rCHR,4(rMEMP3)
        stw rCHR,8(rMEMP3)
        stw rCHR,12(rMEMP3)
        stw rCHR,16(rMEMP3)
        stw rCHR,20(rMEMP3)
        stw rCHR,24(rMEMP3)
        stw rCHR,28(rMEMP3)
        stw rCHR,32(rMEMP3)
        stw rCHR,36(rMEMP3)
        stw rCHR,40(rMEMP3)
        stw rCHR,44(rMEMP3)
        stw rCHR,48(rMEMP3)
        stw rCHR,52(rMEMP3)
        stw rCHR,56(rMEMP3)
        stw rCHR,60(rMEMP3)
#else
/* We are in libc and this is a long memset so we can use FPRs and can afford
   occasional FPU locked interrupts. */
        stfd 0,0(rMEMP)
        stfd 0,8(rMEMP)
        stfd 0,16(rMEMP)
        stfd 0,24(rMEMP)
        stfd 0,32(rMEMP)
        stfd 0,40(rMEMP)
        stfd 0,48(rMEMP)
        stfd 0,56(rMEMP)
        addi rMEMP,rMEMP3,64
        addi rLEN,rLEN,-128
        stfd 0,0(rMEMP3)
        stfd 0,8(rMEMP3)
        stfd 0,16(rMEMP3)
        stfd 0,24(rMEMP3)
        stfd 0,32(rMEMP3)
        stfd 0,40(rMEMP3)
        stfd 0,48(rMEMP3)
        stfd 0,56(rMEMP3)
#endif
        bge cr1,L(nzCacheAligned256)
        dcbtst 0,rMEMP
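/* dcbtst is a "touch for store" hint: it prefetches the cache block at
   rMEMP, which the 32-byte store code entered next is about to fill. */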
        b L(cacheAligned1)

        .align 4
/* Storing a zero "c" value. We are aligned at a sector (32-byte)
   boundary but may not be at cache line (128-byte) boundary. If the
   remaining length spans a full cache line we can use the Data cache
   block zero instruction. */
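/* dcbz zeroes the entire 128-byte cache block containing its effective
   address, so once the pointer is cache-line aligned a single dcbz
   replaces thirty-two stw instructions; it must only be issued for
   blocks that lie wholly inside the region being set. */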
L(zloopstart):
/* memset in 32-byte chunks until we get to a cache line boundary.
   If rLEN is less than the distance to the next cache-line boundary use
   cacheAligned1 code to finish the tail. */
        cmplwi cr1,rLEN,128
        beq L(medium)
L(getCacheAligned):
        andi. rTMP,rMEMP,127
        blt cr1,L(cacheAligned1)
        addi rMEMP3,rMEMP,32
        beq L(cacheAligned)
        addi rLEN,rLEN,-32
        stw rCHR,0(rMEMP)
        stw rCHR,4(rMEMP)
        stw rCHR,8(rMEMP)
        stw rCHR,12(rMEMP)
        stw rCHR,16(rMEMP)
        stw rCHR,20(rMEMP)
        addi rMEMP,rMEMP,32
        andi. rTMP,rMEMP3,127
        stw rCHR,-8(rMEMP3)
        stw rCHR,-4(rMEMP3)
L(getCacheAligned2):
        beq L(cacheAligned)
        addi rLEN,rLEN,-32
        addi rMEMP,rMEMP,32
        stw rCHR,0(rMEMP3)
        stw rCHR,4(rMEMP3)
        stw rCHR,8(rMEMP3)
        stw rCHR,12(rMEMP3)
        andi. rTMP,rMEMP,127
        nop
        stw rCHR,16(rMEMP3)
        stw rCHR,20(rMEMP3)
        stw rCHR,24(rMEMP3)
        stw rCHR,28(rMEMP3)
L(getCacheAligned3):
        beq L(cacheAligned)
/* At this point we can overrun the store queue (pipe reject) so it is
   time to slow things down. The store queue can merge two adjacent
   stores into a single L1/L2 op, but the L2 is clocked at 1/2 the CPU.
   So we add "group ending nops" to guarantee that we dispatch only two
   stores every other cycle. */
        addi rLEN,rLEN,-32
        ori r1,r1,0
        ori r1,r1,0
        stw rCHR,32(rMEMP3)
        stw rCHR,36(rMEMP3)
        addi rMEMP,rMEMP,32
        cmplwi cr1,rLEN,128
        ori r1,r1,0
        stw rCHR,40(rMEMP3)
        stw rCHR,44(rMEMP3)
        cmplwi cr6,rLEN,256
        li rMEMP2,128
        ori r1,r1,0
        stw rCHR,48(rMEMP3)
        stw rCHR,52(rMEMP3)
        ori r1,r1,0
        ori r1,r1,0
        stw rCHR,56(rMEMP3)
        stw rCHR,60(rMEMP3)
        blt cr1,L(cacheAligned1)
        blt cr6,L(cacheAligned128)
        b L(cacheAlignedx)

/* Now we are aligned to the cache line and can use dcbz. */
        .align 4
L(cacheAligned):
        cmplwi cr1,rLEN,128
        cmplwi cr6,rLEN,256
        blt cr1,L(cacheAligned1)
        li rMEMP2,128
L(cacheAlignedx):
        cmplwi cr5,rLEN,640
        blt cr6,L(cacheAligned128)
        bgt cr5,L(cacheAligned512)
        cmplwi cr6,rLEN,512
        dcbz 0,rMEMP
        cmplwi cr1,rLEN,384
        dcbz rMEMP2,rMEMP
        addi rMEMP,rMEMP,256
        addi rLEN,rLEN,-256
        blt cr1,L(cacheAligned1)
        blt cr6,L(cacheAligned128)
        b L(cacheAligned256)
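/* With rMEMP2 = 128, the "dcbz 0,rMEMP" / "dcbz rMEMP2,rMEMP" pair
   above (and in L(cacheAligned256) below) clears two consecutive
   128-byte blocks, i.e. 256 bytes, per iteration. */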
        .align 5
/* A simple loop for the longer (>640 bytes) lengths. This form limits
   the number of mispredicted branches to exactly one, at loop exit. */
L(cacheAligned512):
        cmplwi cr1,rLEN,128
        blt cr1,L(cacheAligned1)
        dcbz 0,rMEMP
        addi rLEN,rLEN,-128
        addi rMEMP,rMEMP,128
        b L(cacheAligned512)
        .align 5
L(cacheAligned256):
        cmplwi cr6,rLEN,512
        dcbz 0,rMEMP
        cmplwi cr1,rLEN,384
        dcbz rMEMP2,rMEMP
        addi rMEMP,rMEMP,256
        addi rLEN,rLEN,-256
        bge cr6,L(cacheAligned256)
        blt cr1,L(cacheAligned1)
        .align 4
L(cacheAligned128):
        dcbz 0,rMEMP
        addi rMEMP,rMEMP,128
        addi rLEN,rLEN,-128
        .align 4
L(cacheAligned1):
        cmplwi cr1,rLEN,32
        blt cr1,L(handletail32)
        addi rMEMP3,rMEMP,32
        addi rLEN,rLEN,-32
        stw rCHR,0(rMEMP)
        stw rCHR,4(rMEMP)
        stw rCHR,8(rMEMP)
        stw rCHR,12(rMEMP)
        stw rCHR,16(rMEMP)
        stw rCHR,20(rMEMP)
        addi rMEMP,rMEMP,32
        cmplwi cr1,rLEN,32
        stw rCHR,-8(rMEMP3)
        stw rCHR,-4(rMEMP3)
L(cacheAligned2):
        blt cr1,L(handletail32)
        addi rLEN,rLEN,-32
        stw rCHR,0(rMEMP3)
        stw rCHR,4(rMEMP3)
        stw rCHR,8(rMEMP3)
        stw rCHR,12(rMEMP3)
        addi rMEMP,rMEMP,32
        cmplwi cr1,rLEN,32
        stw rCHR,16(rMEMP3)
        stw rCHR,20(rMEMP3)
        stw rCHR,24(rMEMP3)
        stw rCHR,28(rMEMP3)
        nop
L(cacheAligned3):
        blt cr1,L(handletail32)
/* At this point we can overrun the store queue (pipe reject) so it is
   time to slow things down. The store queue can merge two adjacent
   stores into a single L1/L2 op, but the L2 is clocked at 1/2 the CPU.
   So we add "group ending nops" to guarantee that we dispatch only two
   stores every other cycle. */
        ori r1,r1,0
        ori r1,r1,0
        addi rMEMP,rMEMP,32
        addi rLEN,rLEN,-32
        ori r1,r1,0
        ori r1,r1,0
        stw rCHR,32(rMEMP3)
        stw rCHR,36(rMEMP3)
        ori r1,r1,0
        ori r1,r1,0
        stw rCHR,40(rMEMP3)
        stw rCHR,44(rMEMP3)
        ori r1,r1,0
        ori r1,r1,0
        stw rCHR,48(rMEMP3)
        stw rCHR,52(rMEMP3)
        ori r1,r1,0
        ori r1,r1,0
        stw rCHR,56(rMEMP3)
        stw rCHR,60(rMEMP3)

/* We are here because the length or remainder (rLEN) is less than the
   cache line/sector size and does not justify aggressive loop unrolling.
   So set up the preconditions for L(medium) and go there. */
        .align 3
L(handletail32):
        cmplwi cr1,rLEN,0
        beqlr cr1
        b L(medium)

        .align 4
L(small):
/* Memset of 4 bytes or less. */
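/* cr5 (rLEN vs. 1) and cr1 (rLEN vs. 3) are both set before any byte
   is written, so each bltlr/beqlr below returns as soon as exactly
   rLEN bytes (0 to 4) have been stored. */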
        cmplwi cr5, rLEN, 1
        cmplwi cr1, rLEN, 3
        bltlr cr5
        stb rCHR, 0(rMEMP)
        beqlr cr5
        stb rCHR, 1(rMEMP)
        bltlr cr1
        stb rCHR, 2(rMEMP)
        beqlr cr1
        stb rCHR, 3(rMEMP)
        blr

/* Memset of 0-31 bytes. */
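/* The tail is steered by CR7, which still holds the low bits of rLEN
   (set by the mtcrf above; the 32-byte loops only subtract multiples
   of 32, so those bits never change). rMEMP is first advanced to the
   end of the region; bit 31 then selects a trailing byte store, bit 30
   a halfword, bit 29 a word, bit 28 two further words, and cr1
   (rLEN >= 16) four more, all at negative offsets. */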
        .align 5
L(medium):
        cmplwi cr1, rLEN, 16
L(medium_tail2):
        add rMEMP, rMEMP, rLEN
L(medium_tail):
        bt- 31, L(medium_31t)
        bt- 30, L(medium_30t)
L(medium_30f):
        bt 29, L(medium_29t)
L(medium_29f):
        bge cr1, L(medium_27t)
        bflr 28
        stw rCHR, -4(rMEMP)
        stw rCHR, -8(rMEMP)
        blr

L(medium_31t):
        stbu rCHR, -1(rMEMP)
        bf- 30, L(medium_30f)
L(medium_30t):
        sthu rCHR, -2(rMEMP)
        bf- 29, L(medium_29f)
L(medium_29t):
        stwu rCHR, -4(rMEMP)
        blt cr1, L(medium_27f)
L(medium_27t):
        stw rCHR, -4(rMEMP)
        stw rCHR, -8(rMEMP)
        stw rCHR, -12(rMEMP)
        stwu rCHR, -16(rMEMP)
L(medium_27f):
        bflr 28
L(medium_28t):
        stw rCHR, -4(rMEMP)
        stw rCHR, -8(rMEMP)
        blr
END (memset)
libc_hidden_builtin_def (memset)