/* Optimized memset implementation for PowerPC64.
   Copyright (C) 1997, 1999, 2000, 2002, 2003, 2007
   Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, write to the Free
   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307 USA.  */

#include <sysdep.h>
#include <bp-sym.h>
#include <bp-asm.h>

	.section	".toc","aw"
.LC0:
	.tc __cache_line_size[TC],__cache_line_size
	.section	".text"
	.align 2

/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]);
   Returns 's'.

   The memset is done in three sizes: byte (8 bits), word (32 bits),
   cache line (256 bits).  There is a special case for setting cache lines
   to 0, to take advantage of the dcbz instruction.  */

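/* For orientation, the overall strategy in rough C (a sketch only;
   helper names such as small_byte_stores and CLS are illustrative,
   not part of this file):

	void *memset (void *s, int c, size_t n)
	{
	  char *p = s;
	  if (n <= 8)
	    return small_byte_stores (s, c, n);
	  byte/half/word stores until p is doubleword aligned;
	  doubleword stores of replicated c until p is 32-byte aligned;
	  if (c == 0)
	    while (n >= CLS) { dcbz (p); p += CLS; n -= CLS; }
	  else
	    while (n >= 32) { four doubleword stores; p += 32; n -= 32; }
	  tail stores of 16/8/4/2/1 bytes driven by the low bits of n;
	  return s;
	}  */
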
EALIGN (BP_SYM (memset), 5, 0)
	CALL_MCOUNT 3

#define rTMP	r0
#define rRTN	r3	/* Initial value of 1st argument.  */
#if __BOUNDED_POINTERS__
# define rMEMP0	r4	/* Original value of 1st arg.  */
# define rCHR	r5	/* Char to set in each byte.  */
# define rLEN	r6	/* Length of region to set.  */
# define rMEMP	r10	/* Address at which we are storing.  */
#else
# define rMEMP0	r3	/* Original value of 1st arg.  */
# define rCHR	r4	/* Char to set in each byte.  */
# define rLEN	r5	/* Length of region to set.  */
# define rMEMP	r6	/* Address at which we are storing.  */
#endif
#define rALIGN	r7	/* Number of bytes we are setting now (when aligning).  */
#define rMEMP2	r8

#define rNEG64	r8	/* Constant -64 for the dcbtst in the store loop.  */
#define rCLS	r8	/* Cache line size (set to 128 below).  */
#define rCLM	r9	/* Cache line size mask to check for cache alignment.  */
L(_memset):
#if __BOUNDED_POINTERS__
	cmpldi	cr1, rRTN, 0
	CHECK_BOUNDS_BOTH_WIDE (rMEMP0, rTMP, rTMP2, rLEN)
	beq	cr1, L(b0)
	STORE_RETURN_VALUE (rMEMP0)
	STORE_RETURN_BOUNDS (rTMP, rTMP2)
L(b0):
#endif
/* Take care of case for size <= 8.  */
	cmpldi	cr1, rLEN, 8
	andi.	rALIGN, rMEMP0, 7
	mr	rMEMP, rMEMP0
	ble-	cr1, L(small)

/* Align to doubleword boundary.  */
	cmpldi	cr5, rLEN, 31
	rlwimi	rCHR, rCHR, 8, 16, 23	/* Replicate byte to halfword.  */
	beq+	L(aligned2)
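/* After the mtcrf below, cr7 holds the low four address bits: bit 31
   is address bit 0, bit 30 is address bit 1, and bit 29 is address
   bit 2, selecting the byte/halfword/word stores needed to reach a
   doubleword boundary.  */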
	mtcrf	0x01, rMEMP0
	subfic	rALIGN, rALIGN, 8
	cror	28,30,31		/* Detect odd word aligned.  */
	add	rMEMP, rMEMP, rALIGN
	sub	rLEN, rLEN, rALIGN
	rlwimi	rCHR, rCHR, 16, 0, 15	/* Replicate halfword to word.  */
	bt	29, L(g4)
/* Process the even word of doubleword.  */
	bf+	31, L(g2)
	stb	rCHR, 0(rMEMP0)
	bt	30, L(g4x)
L(g2):
	sth	rCHR, -6(rMEMP)
L(g4x):
	stw	rCHR, -4(rMEMP)
	b	L(aligned)
/* Process the odd word of doubleword.  */
L(g4):
	bf	28, L(g4x)	/* If false, word aligned on odd word.  */
	bf+	31, L(g0)
	stb	rCHR, 0(rMEMP0)
	bt	30, L(aligned)
L(g0):
	sth	rCHR, -2(rMEMP)

/* Handle the case of size <= 31.  */
L(aligned2):
	rlwimi	rCHR, rCHR, 16, 0, 15	/* Replicate halfword to word.  */
L(aligned):
	mtcrf	0x01, rLEN
	ble	cr5, L(medium)
/* Align to 32-byte boundary.  */
	andi.	rALIGN, rMEMP, 0x18
	subfic	rALIGN, rALIGN, 0x20
	insrdi	rCHR,rCHR,32,0	/* Replicate word to double word.  */
	beq	L(caligned)
	mtcrf	0x01, rALIGN
	add	rMEMP, rMEMP, rALIGN
	sub	rLEN, rLEN, rALIGN
	cmplwi	cr1, rALIGN, 0x10
	mr	rMEMP2, rMEMP
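/* rALIGN is 8, 16, or 24 here: cr7 bit 28 (rALIGN & 8) selects the
   odd doubleword store, and cr1 (rALIGN >= 16) selects the pair.  */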
	bf	28, L(a1)
	stdu	rCHR, -8(rMEMP2)
L(a1):	blt	cr1, L(a2)
	std	rCHR, -8(rMEMP2)
	stdu	rCHR, -16(rMEMP2)
L(a2):

/* Now aligned to a 32-byte boundary.  */
L(caligned):
	cmpldi	cr1, rCHR, 0
	clrrdi.	rALIGN, rLEN, 5
	mtcrf	0x01, rLEN
	beq	cr1, L(zloopstart)	/* Special case for clearing memory using dcbz.  */
L(nondcbz):
	srdi	rTMP, rALIGN, 5
	mtctr	rTMP
	beq	L(medium)	/* We may not actually get to do a full line.  */
	clrldi.	rLEN, rLEN, 59
	add	rMEMP, rMEMP, rALIGN
	li	rNEG64, -0x40
	bdz	L(cloopdone)

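/* Store 32 bytes per iteration, stepping downward from the end of the
   aligned region; the dcbtst touches (prefetches for store) the chunk
   roughly one iteration ahead.  */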
L(c3):	dcbtst	rNEG64, rMEMP
	std	rCHR, -8(rMEMP)
	std	rCHR, -16(rMEMP)
	std	rCHR, -24(rMEMP)
	stdu	rCHR, -32(rMEMP)
	bdnz	L(c3)
L(cloopdone):
	std	rCHR, -8(rMEMP)
	std	rCHR, -16(rMEMP)
	cmpldi	cr1, rLEN, 16
	std	rCHR, -24(rMEMP)
	stdu	rCHR, -32(rMEMP)
	beqlr
	add	rMEMP, rMEMP, rALIGN
	b	L(medium_tail2)

	.align 5
/* Clear lines of memory in 128-byte chunks.  */
L(zloopstart):
/* If the remaining length is less than 32 bytes, don't bother getting
   the cache line size.  */
	beq	L(medium)
	li	rCLS,128	/* Cache line size is 128 bytes.  */

/* Now we know the cache line size, and it is not 32 bytes, but
   we may not yet be aligned to the cache line.  May have a partial
   line to fill, so touch it 1st.  */
	dcbt	0,rMEMP
L(getCacheAligned):
	cmpldi	cr1,rLEN,32
	andi.	rTMP,rMEMP,127
	blt	cr1,L(handletail32)
	beq	L(cacheAligned)
	addi	rMEMP,rMEMP,32
	addi	rLEN,rLEN,-32
	std	rCHR,-32(rMEMP)
	std	rCHR,-24(rMEMP)
	std	rCHR,-16(rMEMP)
	std	rCHR,-8(rMEMP)
	b	L(getCacheAligned)

/* Now we are aligned to the cache line and can use dcbz.  */
L(cacheAligned):
	cmpld	cr1,rLEN,rCLS
	blt	cr1,L(handletail32)
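/* dcbz establishes the whole cache line as zero without first
   fetching it from memory, which is what makes the c == 0 path
   worthwhile.  */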
	dcbz	0,rMEMP
	subf	rLEN,rCLS,rLEN
	add	rMEMP,rMEMP,rCLS
	b	L(cacheAligned)

/* We are here because the cache line size was set and was not 32 bytes,
   and the remainder (rLEN) is less than the actual cache line size.
   So set up the preconditions for L(nondcbz) and go there.  */
L(handletail32):
	clrrwi.	rALIGN, rLEN, 5
	b	L(nondcbz)

	.align 5
L(small):
/* Memset of 8 bytes or less.  */
	cmpldi	cr6, rLEN, 4
	cmpldi	cr5, rLEN, 1
	ble	cr6,L(le4)
	subi	rLEN, rLEN, 4
	stb	rCHR,0(rMEMP)
	stb	rCHR,1(rMEMP)
	stb	rCHR,2(rMEMP)
	stb	rCHR,3(rMEMP)
	addi	rMEMP,rMEMP, 4
	cmpldi	cr5, rLEN, 1
L(le4):
	cmpldi	cr1, rLEN, 3
	bltlr	cr5
	stb	rCHR, 0(rMEMP)
	beqlr	cr5
	stb	rCHR, 1(rMEMP)
	bltlr	cr1
	stb	rCHR, 2(rMEMP)
	beqlr	cr1
	stb	rCHR, 3(rMEMP)
	blr

/* Memset of 0-31 bytes.  */
	.align 5
L(medium):
	insrdi	rCHR,rCHR,32,0	/* Replicate word to double word.  */
	cmpldi	cr1, rLEN, 16
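/* Store the 0-31 byte tail working backward from the end: cr7 (set
   earlier from the low bits of rLEN) selects the 1-, 2-, 4-, and
   8-byte stores, and cr1 selects the extra 16 bytes.  */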
L(medium_tail2):
	add	rMEMP, rMEMP, rLEN
L(medium_tail):
	bt-	31, L(medium_31t)
	bt-	30, L(medium_30t)
L(medium_30f):
	bt-	29, L(medium_29t)
L(medium_29f):
	bge-	cr1, L(medium_27t)
	bflr-	28
	std	rCHR, -8(rMEMP)
	blr

L(medium_31t):
	stbu	rCHR, -1(rMEMP)
	bf-	30, L(medium_30f)
L(medium_30t):
	sthu	rCHR, -2(rMEMP)
	bf-	29, L(medium_29f)
L(medium_29t):
	stwu	rCHR, -4(rMEMP)
	blt-	cr1, L(medium_27f)
L(medium_27t):
	std	rCHR, -8(rMEMP)
	stdu	rCHR, -16(rMEMP)
L(medium_27f):
	bflr-	28
L(medium_28t):
	std	rCHR, -8(rMEMP)
	blr
END_GEN_TB (BP_SYM (memset),TB_TOCLESS)
libc_hidden_builtin_def (memset)

/* Copied from bzero.S to prevent the linker from inserting a stub
   between bzero and memset.  */
ENTRY (BP_SYM (__bzero))
	CALL_MCOUNT 3
#if __BOUNDED_POINTERS__
	mr	r6,r4
	li	r5,0
	mr	r4,r3
	/* Tell memset that we don't want a return value.  */
	li	r3,0
	b	L(_memset)
#else
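	/* bzero (s, n) becomes memset (s, 0, n): the length moves from
	   r4 to r5 and the fill character is zero.  */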
	mr	r5,r4
	li	r4,0
	b	L(_memset)
#endif
END_GEN_TB (BP_SYM (__bzero),TB_TOCLESS)

weak_alias (BP_SYM (__bzero), BP_SYM (bzero))