/* Optimized 32-bit memset implementation for POWER6.
   Copyright (C) 1997-2019 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>. */

#include <sysdep.h>

/* void * [r3] memset (void *s [r3], int c [r4], size_t n [r5]);
   Returns 's'.

   The memset is done in three sizes: byte (8 bits), word (32 bits),
   cache line (1024 bits). There is a special case for setting cache lines
   to 0, to take advantage of the dcbz instruction. */
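
/* Entry dispatch (summarizing the code below): lengths of at most 4
   bytes are handled at L(small), lengths of at most 31 bytes at
   L(medium), and longer lengths fall into the 32-byte and cache-line
   loops, with the dcbz path reserved for a zero fill value. */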

        .machine power6
EALIGN (memset, 7, 0)
        CALL_MCOUNT
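
/* EALIGN places the entry point on a 2^7 (128-byte, i.e. cache-line)
   boundary; CALL_MCOUNT is the usual glibc profiling hook and expands
   to nothing in non-profiled builds. */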

#define rTMP    r0
#define rRTN    r3      /* Initial value of 1st argument. */
#define rMEMP0  r3      /* Original value of 1st arg. */
#define rCHR    r4      /* Char to set in each byte. */
#define rLEN    r5      /* Length of region to set. */
#define rMEMP   r6      /* Address at which we are storing. */
#define rALIGN  r7      /* Number of bytes we are setting now (when aligning). */
#define rMEMP2  r8

#define rNEG64  r8      /* Constant -64 for clearing with dcbz. */
#define rMEMP3  r9      /* Alt mem pointer. */
L(_memset):
/* Take care of case for size <= 4. */
        cmplwi cr1, rLEN, 4
        andi. rALIGN, rMEMP0, 3
        mr rMEMP, rMEMP0
        ble- cr1, L(small)
/* Align to word boundary. */
        cmplwi cr5, rLEN, 31
        insrwi rCHR, rCHR, 8, 16 /* Replicate byte to halfword. */
        beq+ L(aligned)
        mtcrf 0x01, rMEMP0
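/* CR7 now holds the low bits of the address: bit 31 set means the
   address is odd and a byte store is needed; bit 30 is tested after
   that byte store and, if set, the pointer is already word aligned so
   the sth below can be skipped, otherwise the sth completes the
   alignment. */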
        subfic rALIGN, rALIGN, 4
        add rMEMP, rMEMP, rALIGN
        sub rLEN, rLEN, rALIGN
        bf+ 31, L(g0)
        stb rCHR, 0(rMEMP0)
        bt 30, L(aligned)
L(g0):
        sth rCHR, -2(rMEMP)

        .align 4
/* Handle the case of size <= 31. */
L(aligned):
        mtcrf 0x01, rLEN
        insrwi rCHR, rCHR, 16, 0 /* Replicate halfword to word. */
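/* After the two insrwi instructions every byte of rCHR holds the fill
   character; e.g. c = 0x2a yields rCHR = 0x2a2a2a2a, so each stw below
   sets four bytes at once. */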
        ble cr5, L(medium)
/* Align to 32-byte boundary. */
        andi. rALIGN, rMEMP, 0x1C
        subfic rALIGN, rALIGN, 0x20
        beq L(caligned)
        mtcrf 0x01, rALIGN
        add rMEMP, rMEMP, rALIGN
        sub rLEN, rLEN, rALIGN
        cmplwi cr1, rALIGN, 0x10
        mr rMEMP2, rMEMP
        bf 28, L(a1)
        stw rCHR, -4(rMEMP2)
        stwu rCHR, -8(rMEMP2)
        nop
L(a1):  blt cr1, L(a2)
        stw rCHR, -4(rMEMP2)
        stw rCHR, -8(rMEMP2)
        stw rCHR, -12(rMEMP2)
        stwu rCHR, -16(rMEMP2)
L(a2):  bf 29, L(caligned)
        stw rCHR, -4(rMEMP2)

        .align 3
/* Now aligned to a 32 byte boundary. */
L(caligned):
        cmplwi cr1, rCHR, 0
        clrrwi. rALIGN, rLEN, 5
        mtcrf 0x01, rLEN
        beq cr1, L(zloopstart) /* Special case for clearing memory using dcbz. */
L(nondcbz):
        beq L(medium) /* We may not actually get to do a full line. */
        nop
/* Storing a non-zero "c" value. We are aligned at a sector (32-byte)
   boundary but may not be at cache line (128-byte) boundary. */
L(nzloopstart):
/* memset in 32-byte chunks until we get to a cache line boundary.
   If rLEN is less than the distance to the next cache-line boundary use
   cacheAligned1 code to finish the tail. */
        cmplwi cr1,rLEN,128

        andi. rTMP,rMEMP,127
        blt cr1,L(cacheAligned1)
        addi rMEMP3,rMEMP,32
        beq L(nzCacheAligned)
        addi rLEN,rLEN,-32
        stw rCHR,0(rMEMP)
        stw rCHR,4(rMEMP)
        stw rCHR,8(rMEMP)
        stw rCHR,12(rMEMP)
        stw rCHR,16(rMEMP)
        stw rCHR,20(rMEMP)
        addi rMEMP,rMEMP,32
        andi. rTMP,rMEMP3,127
        stw rCHR,-8(rMEMP3)
        stw rCHR,-4(rMEMP3)

        beq L(nzCacheAligned)
        addi rLEN,rLEN,-32
        stw rCHR,0(rMEMP3)
        stw rCHR,4(rMEMP3)
        addi rMEMP,rMEMP,32
        stw rCHR,8(rMEMP3)
        stw rCHR,12(rMEMP3)
        andi. rTMP,rMEMP,127
        stw rCHR,16(rMEMP3)
        stw rCHR,20(rMEMP3)
        stw rCHR,24(rMEMP3)
        stw rCHR,28(rMEMP3)

        beq L(nzCacheAligned)
        addi rLEN,rLEN,-32
/* At this point we can overrun the store queue (pipe reject) so it is
   time to slow things down. The store queue can merge two adjacent
   stores into a single L1/L2 op, but the L2 is clocked at 1/2 the CPU.
   So we add "group ending nops" to guarantee that we dispatch only two
   stores every other cycle. */
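/* The "ori r1,r1,0" instructions below have no architectural effect
   (OR of r1 with zero); they serve only as the group ending nops
   described above, forcing a dispatch group boundary so the stores are
   issued no faster than the store queue can drain. */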
        ori r1,r1,0
        ori r1,r1,0
        stw rCHR,32(rMEMP3)
        stw rCHR,36(rMEMP3)
        addi rMEMP,rMEMP,32
        cmplwi cr1,rLEN,128
        ori r1,r1,0
        ori r1,r1,0
        stw rCHR,40(rMEMP3)
        stw rCHR,44(rMEMP3)
        ori r1,r1,0
        ori r1,r1,0
        stw rCHR,48(rMEMP3)
        stw rCHR,52(rMEMP3)
        ori r1,r1,0
        ori r1,r1,0
        stw rCHR,56(rMEMP3)
        stw rCHR,60(rMEMP3)
        blt cr1,L(cacheAligned1)
        b L(nzCacheAligned)

/* Now we are aligned to the cache line and can use dcbtst. */
        .align 5
L(nzCacheAligned):
        cmplwi cr1,rLEN,128
        cmplwi cr6,rLEN,256
        blt cr1,L(cacheAligned1)
        blt cr6,L(nzCacheAligned128)
        .align 4
L(nzCacheAligned128):
        nop
        addi rMEMP3,rMEMP,64
        stw rCHR,0(rMEMP)
        stw rCHR,4(rMEMP)
        stw rCHR,8(rMEMP)
        stw rCHR,12(rMEMP)
        stw rCHR,16(rMEMP)
        stw rCHR,20(rMEMP)
        stw rCHR,24(rMEMP)
        stw rCHR,28(rMEMP)
        stw rCHR,32(rMEMP)
        stw rCHR,36(rMEMP)
        stw rCHR,40(rMEMP)
        stw rCHR,44(rMEMP)
        stw rCHR,48(rMEMP)
        stw rCHR,52(rMEMP)
        stw rCHR,56(rMEMP)
        stw rCHR,60(rMEMP)
        addi rMEMP,rMEMP3,64
        addi rLEN,rLEN,-128
/* At this point we can overrun the store queue (pipe reject) so it is
   time to slow things down. The store queue can merge two adjacent
   stores into a single L1/L2 op, but the L2 is clocked at 1/2 the CPU.
   So we add "group ending nops" to guarantee that we dispatch only one
   store per cycle. */
        stw rCHR,0(rMEMP3)
        ori r1,r1,0
        stw rCHR,4(rMEMP3)
        ori r1,r1,0
        stw rCHR,8(rMEMP3)
        ori r1,r1,0
        stw rCHR,12(rMEMP3)
        ori r1,r1,0
        stw rCHR,16(rMEMP3)
        ori r1,r1,0
        stw rCHR,20(rMEMP3)
        ori r1,r1,0
        stw rCHR,24(rMEMP3)
        ori r1,r1,0
        stw rCHR,28(rMEMP3)
        ori r1,r1,0
        stw rCHR,32(rMEMP3)
        ori r1,r1,0
        stw rCHR,36(rMEMP3)
        ori r1,r1,0
        stw rCHR,40(rMEMP3)
        ori r1,r1,0
        stw rCHR,44(rMEMP3)
        ori r1,r1,0
        stw rCHR,48(rMEMP3)
        ori r1,r1,0
        stw rCHR,52(rMEMP3)
        ori r1,r1,0
        stw rCHR,56(rMEMP3)
        ori r1,r1,0
        stw rCHR,60(rMEMP3)
        blt cr6,L(cacheAligned1)
#if IS_IN (libc)
        lfd 0,-128(rMEMP)
#endif
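/* The 128 bytes just stored already contain the replicated fill
   character, so in the libc build the lfd above simply reloads 8 bytes
   of that pattern into FPR 0 for the stfd stores used below. */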
        b L(nzCacheAligned256)
        .align 5
L(nzCacheAligned256):
        cmplwi cr1,rLEN,256
        addi rMEMP3,rMEMP,64
#if !IS_IN (libc)
/* When we are not in libc we should use only GPRs to avoid the FPU lock
   interrupt. */
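/* Outside libc (e.g. in the dynamic loader) the thread's FP state may
   not be live, and touching an FPR could force the kernel to enable
   and restore it; plain stw stores avoid that at the cost of twice as
   many store instructions per 128 bytes. */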
        stw rCHR,0(rMEMP)
        stw rCHR,4(rMEMP)
        stw rCHR,8(rMEMP)
        stw rCHR,12(rMEMP)
        stw rCHR,16(rMEMP)
        stw rCHR,20(rMEMP)
        stw rCHR,24(rMEMP)
        stw rCHR,28(rMEMP)
        stw rCHR,32(rMEMP)
        stw rCHR,36(rMEMP)
        stw rCHR,40(rMEMP)
        stw rCHR,44(rMEMP)
        stw rCHR,48(rMEMP)
        stw rCHR,52(rMEMP)
        stw rCHR,56(rMEMP)
        stw rCHR,60(rMEMP)
        addi rMEMP,rMEMP3,64
        addi rLEN,rLEN,-128
        stw rCHR,0(rMEMP3)
        stw rCHR,4(rMEMP3)
        stw rCHR,8(rMEMP3)
        stw rCHR,12(rMEMP3)
        stw rCHR,16(rMEMP3)
        stw rCHR,20(rMEMP3)
        stw rCHR,24(rMEMP3)
        stw rCHR,28(rMEMP3)
        stw rCHR,32(rMEMP3)
        stw rCHR,36(rMEMP3)
        stw rCHR,40(rMEMP3)
        stw rCHR,44(rMEMP3)
        stw rCHR,48(rMEMP3)
        stw rCHR,52(rMEMP3)
        stw rCHR,56(rMEMP3)
        stw rCHR,60(rMEMP3)
#else
/* We are in libc and this is a long memset so we can use FPRs and can afford
   occasional FPU locked interrupts. */
        stfd 0,0(rMEMP)
        stfd 0,8(rMEMP)
        stfd 0,16(rMEMP)
        stfd 0,24(rMEMP)
        stfd 0,32(rMEMP)
        stfd 0,40(rMEMP)
        stfd 0,48(rMEMP)
        stfd 0,56(rMEMP)
        addi rMEMP,rMEMP3,64
        addi rLEN,rLEN,-128
        stfd 0,0(rMEMP3)
        stfd 0,8(rMEMP3)
        stfd 0,16(rMEMP3)
        stfd 0,24(rMEMP3)
        stfd 0,32(rMEMP3)
        stfd 0,40(rMEMP3)
        stfd 0,48(rMEMP3)
        stfd 0,56(rMEMP3)
#endif
        bge cr1,L(nzCacheAligned256)
        dcbtst 0,rMEMP
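/* dcbtst is a "touch for store" hint: it prefetches the cache block at
   rMEMP, which the 32-byte store code entered next is about to fill. */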
        b L(cacheAligned1)

        .align 4
/* Storing a zero "c" value. We are aligned at a sector (32-byte)
   boundary but may not be at cache line (128-byte) boundary. If the
   remaining length spans a full cache line we can use the Data cache
   block zero instruction. */
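/* dcbz zeroes the entire 128-byte cache block containing its effective
   address, so once the pointer is cache-line aligned a single dcbz
   replaces thirty-two stw instructions; it must only be issued for
   blocks that lie wholly inside the region being set. */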
L(zloopstart):
/* memset in 32-byte chunks until we get to a cache line boundary.
   If rLEN is less than the distance to the next cache-line boundary use
   cacheAligned1 code to finish the tail. */
        cmplwi cr1,rLEN,128
        beq L(medium)
L(getCacheAligned):
        andi. rTMP,rMEMP,127
        blt cr1,L(cacheAligned1)
        addi rMEMP3,rMEMP,32
        beq L(cacheAligned)
        addi rLEN,rLEN,-32
        stw rCHR,0(rMEMP)
        stw rCHR,4(rMEMP)
        stw rCHR,8(rMEMP)
        stw rCHR,12(rMEMP)
        stw rCHR,16(rMEMP)
        stw rCHR,20(rMEMP)
        addi rMEMP,rMEMP,32
        andi. rTMP,rMEMP3,127
        stw rCHR,-8(rMEMP3)
        stw rCHR,-4(rMEMP3)
L(getCacheAligned2):
        beq L(cacheAligned)
        addi rLEN,rLEN,-32
        addi rMEMP,rMEMP,32
        stw rCHR,0(rMEMP3)
        stw rCHR,4(rMEMP3)
        stw rCHR,8(rMEMP3)
        stw rCHR,12(rMEMP3)
        andi. rTMP,rMEMP,127
        nop
        stw rCHR,16(rMEMP3)
        stw rCHR,20(rMEMP3)
        stw rCHR,24(rMEMP3)
        stw rCHR,28(rMEMP3)
L(getCacheAligned3):
        beq L(cacheAligned)
/* At this point we can overrun the store queue (pipe reject) so it is
   time to slow things down. The store queue can merge two adjacent
   stores into a single L1/L2 op, but the L2 is clocked at 1/2 the CPU.
   So we add "group ending nops" to guarantee that we dispatch only two
   stores every other cycle. */
        addi rLEN,rLEN,-32
        ori r1,r1,0
        ori r1,r1,0
        stw rCHR,32(rMEMP3)
        stw rCHR,36(rMEMP3)
        addi rMEMP,rMEMP,32
        cmplwi cr1,rLEN,128
        ori r1,r1,0
        stw rCHR,40(rMEMP3)
        stw rCHR,44(rMEMP3)
        cmplwi cr6,rLEN,256
        li rMEMP2,128
        ori r1,r1,0
        stw rCHR,48(rMEMP3)
        stw rCHR,52(rMEMP3)
        ori r1,r1,0
        ori r1,r1,0
        stw rCHR,56(rMEMP3)
        stw rCHR,60(rMEMP3)
        blt cr1,L(cacheAligned1)
        blt cr6,L(cacheAligned128)
        b L(cacheAlignedx)

/* Now we are aligned to the cache line and can use dcbz. */
        .align 4
L(cacheAligned):
        cmplwi cr1,rLEN,128
        cmplwi cr6,rLEN,256
        blt cr1,L(cacheAligned1)
        li rMEMP2,128
L(cacheAlignedx):
        cmplwi cr5,rLEN,640
        blt cr6,L(cacheAligned128)
        bgt cr5,L(cacheAligned512)
        cmplwi cr6,rLEN,512
        dcbz 0,rMEMP
        cmplwi cr1,rLEN,384
        dcbz rMEMP2,rMEMP
        addi rMEMP,rMEMP,256
        addi rLEN,rLEN,-256
        blt cr1,L(cacheAligned1)
        blt cr6,L(cacheAligned128)
        b L(cacheAligned256)
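/* With rMEMP2 = 128, the "dcbz 0,rMEMP" / "dcbz rMEMP2,rMEMP" pair
   above (and in L(cacheAligned256) below) clears two consecutive
   128-byte blocks, i.e. 256 bytes, per iteration. */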
        .align 5
/* A simple loop for the longer (>640 bytes) lengths. This form limits
   the number of mispredicted branches to exactly one, at loop exit. */
L(cacheAligned512):
        cmplwi cr1,rLEN,128
        blt cr1,L(cacheAligned1)
        dcbz 0,rMEMP
        addi rLEN,rLEN,-128
        addi rMEMP,rMEMP,128
        b L(cacheAligned512)
        .align 5
L(cacheAligned256):
        cmplwi cr6,rLEN,512
        dcbz 0,rMEMP
        cmplwi cr1,rLEN,384
        dcbz rMEMP2,rMEMP
        addi rMEMP,rMEMP,256
        addi rLEN,rLEN,-256
        bge cr6,L(cacheAligned256)
        blt cr1,L(cacheAligned1)
        .align 4
L(cacheAligned128):
        dcbz 0,rMEMP
        addi rMEMP,rMEMP,128
        addi rLEN,rLEN,-128
        .align 4
L(cacheAligned1):
        cmplwi cr1,rLEN,32
        blt cr1,L(handletail32)
        addi rMEMP3,rMEMP,32
        addi rLEN,rLEN,-32
        stw rCHR,0(rMEMP)
        stw rCHR,4(rMEMP)
        stw rCHR,8(rMEMP)
        stw rCHR,12(rMEMP)
        stw rCHR,16(rMEMP)
        stw rCHR,20(rMEMP)
        addi rMEMP,rMEMP,32
        cmplwi cr1,rLEN,32
        stw rCHR,-8(rMEMP3)
        stw rCHR,-4(rMEMP3)
L(cacheAligned2):
        blt cr1,L(handletail32)
        addi rLEN,rLEN,-32
        stw rCHR,0(rMEMP3)
        stw rCHR,4(rMEMP3)
        stw rCHR,8(rMEMP3)
        stw rCHR,12(rMEMP3)
        addi rMEMP,rMEMP,32
        cmplwi cr1,rLEN,32
        stw rCHR,16(rMEMP3)
        stw rCHR,20(rMEMP3)
        stw rCHR,24(rMEMP3)
        stw rCHR,28(rMEMP3)
        nop
L(cacheAligned3):
        blt cr1,L(handletail32)
/* At this point we can overrun the store queue (pipe reject) so it is
   time to slow things down. The store queue can merge two adjacent
   stores into a single L1/L2 op, but the L2 is clocked at 1/2 the CPU.
   So we add "group ending nops" to guarantee that we dispatch only two
   stores every other cycle. */
        ori r1,r1,0
        ori r1,r1,0
        addi rMEMP,rMEMP,32
        addi rLEN,rLEN,-32
        ori r1,r1,0
        ori r1,r1,0
        stw rCHR,32(rMEMP3)
        stw rCHR,36(rMEMP3)
        ori r1,r1,0
        ori r1,r1,0
        stw rCHR,40(rMEMP3)
        stw rCHR,44(rMEMP3)
        ori r1,r1,0
        ori r1,r1,0
        stw rCHR,48(rMEMP3)
        stw rCHR,52(rMEMP3)
        ori r1,r1,0
        ori r1,r1,0
        stw rCHR,56(rMEMP3)
        stw rCHR,60(rMEMP3)

/* We are here because the length or remainder (rLEN) is less than the
   cache line/sector size and does not justify aggressive loop unrolling.
   So set up the preconditions for L(medium) and go there. */
        .align 3
L(handletail32):
        cmplwi cr1,rLEN,0
        beqlr cr1
        b L(medium)

        .align 4
L(small):
/* Memset of 4 bytes or less. */
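/* cr5 (rLEN vs. 1) and cr1 (rLEN vs. 3) are both set before any byte
   is written, so each bltlr/beqlr below returns as soon as exactly
   rLEN bytes (0 to 4) have been stored. */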
        cmplwi cr5, rLEN, 1
        cmplwi cr1, rLEN, 3
        bltlr cr5
        stb rCHR, 0(rMEMP)
        beqlr cr5
        stb rCHR, 1(rMEMP)
        bltlr cr1
        stb rCHR, 2(rMEMP)
        beqlr cr1
        stb rCHR, 3(rMEMP)
        blr

/* Memset of 0-31 bytes. */
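/* The tail is steered by CR7, which still holds the low bits of rLEN
   (set by the mtcrf above; the 32-byte loops only subtract multiples
   of 32, so those bits never change). rMEMP is first advanced to the
   end of the region; bit 31 then selects a trailing byte store, bit 30
   a halfword, bit 29 a word, bit 28 two further words, and cr1
   (rLEN >= 16) four more, all at negative offsets. */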
        .align 5
L(medium):
        cmplwi cr1, rLEN, 16
L(medium_tail2):
        add rMEMP, rMEMP, rLEN
L(medium_tail):
        bt- 31, L(medium_31t)
        bt- 30, L(medium_30t)
L(medium_30f):
        bt 29, L(medium_29t)
L(medium_29f):
        bge cr1, L(medium_27t)
        bflr 28
        stw rCHR, -4(rMEMP)
        stw rCHR, -8(rMEMP)
        blr

L(medium_31t):
        stbu rCHR, -1(rMEMP)
        bf- 30, L(medium_30f)
L(medium_30t):
        sthu rCHR, -2(rMEMP)
        bf- 29, L(medium_29f)
L(medium_29t):
        stwu rCHR, -4(rMEMP)
        blt cr1, L(medium_27f)
L(medium_27t):
        stw rCHR, -4(rMEMP)
        stw rCHR, -8(rMEMP)
        stw rCHR, -12(rMEMP)
        stwu rCHR, -16(rMEMP)
L(medium_27f):
        bflr 28
L(medium_28t):
        stw rCHR, -4(rMEMP)
        stw rCHR, -8(rMEMP)
        blr
END (memset)
libc_hidden_builtin_def (memset)