]> git.ipfire.org Git - thirdparty/glibc.git/blame - sysdeps/powerpc/powerpc32/power6/memset.S
[BZ #4775, BZ #4776]
[thirdparty/glibc.git] / sysdeps / powerpc / powerpc32 / power6 / memset.S
CommitLineData
04067002
UD
1/* Optimized 32-bit memset implementation for POWER6.
2 Copyright (C) 1997,99, 2000,02,03, 2006 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, write to the Free
17 Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA
18 02110-1301 USA. */
19
20#include <sysdep.h>
21#include <bp-sym.h>
22#include <bp-asm.h>
23
24/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]);
25 Returns 's'.
26
27 The memset is done in three sizes: byte (8 bits), word (32 bits),
28 cache line (1024 bits). There is a special case for setting cache lines
29 to 0, to take advantage of the dcbz instruction. */
30
31EALIGN (BP_SYM (memset), 5, 0) /* Function entry.  EALIGN and BP_SYM come from the included headers; presumably this requests 2^5-byte alignment of the entry point -- confirm in sysdep.h. */
32 CALL_MCOUNT /* Profiling hook macro, defined outside this file. */
33
34#define rTMP r0 /* Scratch: loop count and alignment test result. */
35#define rRTN r3 /* Initial value of 1st argument (alias not referenced in the code below). */
36#define rMEMP0 r3 /* Original value of 1st arg; never written, so r3 still holds 's' on return. */
37#define rCHR r4 /* Char to set in each byte; replicated to a full word before word stores. */
38#define rLEN r5 /* Length of region to set; reduced as alignment bytes are stored. */
39#define rMEMP r6 /* Address at which we are storing. */
40#define rALIGN r7 /* Number of bytes we are setting now (when aligning); also the byte count of whole 32-byte blocks. */
41#define rMEMP2 r8 /* Working pointer for the 32-byte alignment stores.  Shares r8 with rNEG64 and rCLS. */
42
43#define rNEG64 r8 /* Constant -64, used as the dcbtst offset in the store loop.  Shares r8 with rMEMP2 and rCLS. */
44#define rCLS r8 /* Cache line size (known to be 128).  Shares r8 with rMEMP2 and rNEG64. */
45#define rCLM r9 /* Cache line size mask to check for cache alignment (alias not referenced in the code below). */
46L(_memset):
47/* Entry: dispatch lengths <= 4 to L(small); otherwise advance rMEMP to a word boundary. */
48 cmplwi cr1, rLEN, 4 /* cr1 = unsigned compare of length with 4. */
49 andi. rALIGN, rMEMP0, 3 /* rALIGN = s & 3; cr0.eq set when s is already word aligned. */
50 mr rMEMP, rMEMP0 /* Work on a copy so r3 is left holding 's'. */
51 ble- cr1, L(small)
52
53/* Align to word boundary. */
54 cmplwi cr5, rLEN, 31 /* cr5 = original length vs 31; consumed by the ble at L(aligned). */
55 rlwimi rCHR, rCHR, 8, 16, 23 /* Replicate byte to halfword. */
56 beq+ L(aligned) /* cr0 from the andi. above: already word aligned. */
57 mtcrf 0x01, rMEMP0 /* cr7 = low four bits of the address, so bits 30/31 can be tested below. */
58 subfic rALIGN, rALIGN, 4 /* rALIGN = 4 - (s & 3) = bytes up to the next word boundary. */
59 add rMEMP, rMEMP, rALIGN /* rMEMP = first word-aligned address. */
60 sub rLEN, rLEN, rALIGN /* Those bytes are accounted for here and stored below. */
61 bf+ 31, L(g0) /* Even address (s & 3 == 2): only a halfword is needed. */
62 stb rCHR, 0(rMEMP0) /* Odd address: store the first byte. */
63 bt 30, L(aligned) /* s & 3 == 3: that single byte reached the boundary. */
64L(g0):
65 sth rCHR, -2(rMEMP) /* Fill the two bytes just below the word boundary. */
66
67 .align 4
68/* rMEMP is word aligned.  Requests whose original length was <= 31 (cr5, set before the word-alignment adjustment) are finished by L(medium). */
69L(aligned):
70 mtcrf 0x01, rLEN /* cr7 = low four bits of the remaining length, as tested by the L(medium) code. */
71 rlwimi rCHR, rCHR, 16, 0, 15 /* Replicate halfword to word. */
72 ble cr5, L(medium)
73/* Align to 32-byte boundary. */
74 andi. rALIGN, rMEMP, 0x1C /* Word offset within the 32-byte block; cr0.eq if already 32-byte aligned. */
75 subfic rALIGN, rALIGN, 0x20 /* rALIGN = bytes up to the next 32-byte boundary (a multiple of 4). */
76 beq L(caligned) /* cr0 still comes from the andi. above. */
77 mtcrf 0x01, rALIGN /* cr7 = low four bits of rALIGN: bit 28 = 8s bit, bit 29 = 4s bit. */
78 add rMEMP, rMEMP, rALIGN /* rMEMP = 32-byte aligned address. */
79 sub rLEN, rLEN, rALIGN
80 cmplwi cr1, rALIGN, 0x10 /* cr1: are at least 16 bytes needed? */
81 mr rMEMP2, rMEMP /* Store backwards from the boundary using rMEMP2. */
82 bf 28, L(a1) /* Skip the 8-byte piece if the 8s bit is clear. */
83 stw rCHR, -4(rMEMP2)
84 stwu rCHR, -8(rMEMP2) /* Update form moves rMEMP2 down by 8. */
85L(a1): blt cr1, L(a2) /* Skip the 16-byte piece if rALIGN < 16. */
86 stw rCHR, -4(rMEMP2)
87 stw rCHR, -8(rMEMP2)
88 stw rCHR, -12(rMEMP2)
89 stwu rCHR, -16(rMEMP2) /* Update form moves rMEMP2 down by 16. */
90L(a2): bf 29, L(caligned) /* Skip the last word if the 4s bit is clear. */
91 stw rCHR, -4(rMEMP2)
92
93 .align 4
94/* Now aligned to a 32 byte boundary.  Zero fills take the dcbz path; everything else is stored in 32-byte blocks, highest address first. */
95L(caligned):
96 cmplwi cr1, rCHR, 0 /* cr1.eq if the replicated fill word is zero. */
97 clrrwi. rALIGN, rLEN, 5 /* rALIGN = length rounded down to a multiple of 32; cr0.eq if there is no whole block. */
98 mtcrf 0x01, rLEN /* cr7 = low four bits of the remaining length, for the tail code. */
99 beq cr1, L(zloopstart) /* Special case for clearing memory using dcbz. */
100L(nondcbz):
101 srwi rTMP, rALIGN, 5 /* rTMP = number of whole 32-byte blocks. */
102 mtctr rTMP
103 beq L(medium) /* No whole 32-byte block (cr0 from the clrrwi.): only a tail remains. */
104 clrlwi. rLEN, rLEN, 27 /* rLEN = length mod 32; cr0.eq if there is no tail. */
105 add rMEMP, rMEMP, rALIGN /* Point just past the last whole block. */
106 li rNEG64, -0x40 /* Offset used by the dcbtst in the loop. */
107 bdz L(cloopdone) /* Exactly one block: skip the loop. */
108
109 .align 4
110L(c3): dcbtst rNEG64, rMEMP /* Store-touch hint for the address 64 bytes below rMEMP. */
111 stw rCHR, -4(rMEMP)
112 stw rCHR, -8(rMEMP)
113 stw rCHR, -12(rMEMP)
114 stw rCHR, -16(rMEMP)
115 stw rCHR, -20(rMEMP)
116 stw rCHR, -24(rMEMP)
117 stw rCHR, -28(rMEMP)
118 stwu rCHR, -32(rMEMP) /* Update form steps rMEMP down one block. */
119 bdnz L(c3)
120L(cloopdone):
121 stw rCHR, -4(rMEMP) /* Final (lowest) 32-byte block. */
122 stw rCHR, -8(rMEMP)
123 stw rCHR, -12(rMEMP)
124 stw rCHR, -16(rMEMP)
125 cmplwi cr1, rLEN, 16 /* cr1 = tail length vs 16, as expected at L(medium_tail2). */
126 stw rCHR, -20(rMEMP)
127 stw rCHR, -24(rMEMP)
128 stw rCHR, -28(rMEMP)
129 stwu rCHR, -32(rMEMP)
130 beqlr /* cr0 from the clrlwi.: no tail bytes, so return. */
131 add rMEMP, rMEMP, rALIGN /* The stores walked rMEMP down by rALIGN; move back past the blocks. */
132 b L(medium_tail2) /* Store the remaining rLEN (< 32) bytes. */
133
134 .align 5
135/* Clear lines of memory in 128-byte chunks. */
136L(zloopstart):
137/* If the remaining length is less than 32 bytes (cr0 from the clrrwi. at
138 L(caligned)), skip the cache line handling and finish in L(medium). */
139 beq L(medium)
140 li rCLS,128 /* cache line size is 128 */
141 dcbt 0,rMEMP /* Touch hint for the line at rMEMP. */
142L(getCacheAligned):
143 cmplwi cr1,rLEN,32 /* cr1: is a whole 32-byte block left? */
144 andi. rTMP,rMEMP,127 /* cr0.eq when rMEMP is 128-byte aligned. */
145 blt cr1,L(handletail32) /* Fewer than 32 bytes left: finish with ordinary stores. */
146 beq L(cacheAligned)
147 addi rMEMP,rMEMP,32 /* Not yet line aligned: store one 32-byte block (rCHR is zero here) and retry. */
148 addi rLEN,rLEN,-32
149 stw rCHR,-32(rMEMP)
150 stw rCHR,-28(rMEMP)
151 stw rCHR,-24(rMEMP)
152 stw rCHR,-20(rMEMP)
153 stw rCHR,-16(rMEMP)
154 stw rCHR,-12(rMEMP)
155 stw rCHR,-8(rMEMP)
156 stw rCHR,-4(rMEMP)
157 b L(getCacheAligned)
158
159/* Now we are aligned to the cache line and can use dcbz. */
160 .align 4
161L(cacheAligned):
162 cmplw cr1,rLEN,rCLS /* Is a whole cache line left? */
163 blt cr1,L(handletail32)
164 dcbz 0,rMEMP /* Zero the whole cache block at rMEMP. */
165 subf rLEN,rCLS,rLEN /* rLEN -= 128. */
166 add rMEMP,rMEMP,rCLS /* rMEMP += 128. */
167 b L(cacheAligned)
168
169/* We are here because the remainder (rLEN) is less than the cache line
170 size, or less than 32 bytes before cache line alignment was reached.
171 So set up the preconditions for L(nondcbz) (rALIGN and cr0) and go there. */
172 .align 3
173L(handletail32):
174 clrrwi. rALIGN, rLEN, 5 /* rALIGN = whole 32-byte blocks left, in bytes; cr0.eq if none. */
175 b L(nondcbz)
176
177 .align 5
178L(small):
179/* Memset of 4 bytes or less: store one byte at a time, returning as soon as rLEN bytes are written. */
180 cmplwi cr5, rLEN, 1 /* cr5 = length vs 1. */
181 cmplwi cr1, rLEN, 3 /* cr1 = length vs 3. */
182 bltlr cr5 /* Length 0: nothing to store. */
183 stb rCHR, 0(rMEMP)
184 beqlr cr5 /* Length 1. */
185 stb rCHR, 1(rMEMP)
186 bltlr cr1 /* Length 2. */
187 stb rCHR, 2(rMEMP)
188 beqlr cr1 /* Length 3. */
189 stb rCHR, 3(rMEMP) /* Length 4. */
190 blr
191
192/* Memset of 0-31 bytes.  Expects cr7 to hold the low four bits of rLEN (loaded by mtcrf 0x01, rLEN before branching here); stores run backwards from the end of the region. */
193 .align 5
194L(medium):
195 cmplwi cr1, rLEN, 16 /* cr1: is the 16-byte piece needed? */
196L(medium_tail2):
197 add rMEMP, rMEMP, rLEN /* rMEMP = one past the last byte to set. */
198L(medium_tail):
199 bt- 31, L(medium_31t) /* 1s bit of the length set: a byte is needed. */
200 bt- 30, L(medium_30t) /* 2s bit set: a halfword is needed. */
201L(medium_30f):
202 bt- 29, L(medium_29t) /* 4s bit set: a word is needed. */
203L(medium_29f):
204 bge- cr1, L(medium_27t) /* Length >= 16: store the 16-byte piece. */
205 bflr- 28 /* 8s bit clear: done. */
206 stw rCHR, -4(rMEMP) /* Final 8 bytes. */
207 stw rCHR, -8(rMEMP)
208 blr
209
210L(medium_31t):
211 stbu rCHR, -1(rMEMP) /* Update forms step rMEMP down past each piece stored. */
212 bf- 30, L(medium_30f)
213L(medium_30t):
214 sthu rCHR, -2(rMEMP)
215 bf- 29, L(medium_29f)
216L(medium_29t):
217 stwu rCHR, -4(rMEMP)
218 blt- cr1, L(medium_27f) /* Length < 16: skip the 16-byte piece. */
219L(medium_27t):
220 stw rCHR, -4(rMEMP)
221 stw rCHR, -8(rMEMP)
222 stw rCHR, -12(rMEMP)
223 stwu rCHR, -16(rMEMP)
224L(medium_27f):
225 bflr- 28 /* 8s bit clear: done. */
226L(medium_28t):
227 stw rCHR, -4(rMEMP) /* Final 8 bytes. */
228 stw rCHR, -8(rMEMP)
229 blr
230END (BP_SYM (memset))
231libc_hidden_builtin_def (memset)