/* Optimized 64-bit memset implementation for POWER6.
   Copyright (C) 1997-2014 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>
/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]);
   Returns 's'.

   The memset is done in three sizes: byte (8 bits), word (32 bits),
   and 32-byte sectors (256 bits).  There is a special case for setting
   whole cache lines to 0, to take advantage of the dcbz instruction.  */

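/* The store value is built by widening the low byte of r4 in three
   steps with insrdi (byte -> halfword -> word -> doubleword).  A rough
   C equivalent of that replication, for illustration only (not part of
   the build):

     uint64_t v = (unsigned char) c;
     v |= v << 8;        byte     -> halfword    (insrdi rCHR,rCHR,8,48)
     v |= v << 16;       halfword -> word        (insrdi rCHR,rCHR,16,32)
     v |= v << 32;       word     -> doubleword  (insrdi rCHR,rCHR,32,0)

   The replication steps are interleaved with the alignment code below
   so the wider value is only built up on the paths that need it.  */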
        .machine power6
EALIGN (memset, 7, 0)
        CALL_MCOUNT 3

#define rTMP    r0
#define rRTN    r3      /* Initial value of 1st argument.  */
#define rMEMP0  r3      /* Original value of 1st arg.  */
#define rCHR    r4      /* Char to set in each byte.  */
#define rLEN    r5      /* Length of region to set.  */
#define rMEMP   r6      /* Address at which we are storing.  */
#define rALIGN  r7      /* Number of bytes we are setting now (when aligning).  */
#define rMEMP2  r8
#define rMEMP3  r9      /* Alt mem pointer.  */
L(_memset):
/* Take care of case for size <= 8.  */
        cmpldi  cr1, rLEN, 8
        andi.   rALIGN, rMEMP0, 7
        mr      rMEMP, rMEMP0
        ble     cr1, L(small)

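/* The alignment code relies on a condition-register trick: mtcrf 0x01
   copies the low four bits of an address (or count) into CR7, so bt/bf
   on CR bits 28-31 can test the 8/4/2/1 bits directly without extra
   compares.  The cror below folds bits 30 and 31 into bit 28, leaving
   bit 28 set exactly when the address is not word aligned.  */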
/* Align to doubleword boundary.  */
        cmpldi  cr5, rLEN, 31
        insrdi  rCHR, rCHR, 8, 48       /* Replicate byte to halfword.  */
        beq+    L(aligned2)
        mtcrf   0x01, rMEMP0
        subfic  rALIGN, rALIGN, 8
        cror    28,30,31                /* Detect odd word aligned.  */
        add     rMEMP, rMEMP, rALIGN
        sub     rLEN, rLEN, rALIGN
        insrdi  rCHR, rCHR, 16, 32      /* Replicate halfword to word.  */
        bt      29, L(g4)
/* Process the even word of doubleword.  */
        bf+     31, L(g2)
        stb     rCHR, 0(rMEMP0)
        bt      30, L(g4x)
L(g2):
        sth     rCHR, -6(rMEMP)
L(g4x):
        stw     rCHR, -4(rMEMP)
        b       L(aligned)
/* Process the odd word of doubleword.  */
L(g4):
        bf      28, L(g4x)              /* If false, word aligned on odd word.  */
        bf+     31, L(g0)
        stb     rCHR, 0(rMEMP0)
        bt      30, L(aligned)
L(g0):
        sth     rCHR, -2(rMEMP)

/* Handle the case of size < 31.  */
L(aligned2):
        insrdi  rCHR, rCHR, 16, 32      /* Replicate halfword to word.  */
L(aligned):
        mtcrf   0x01, rLEN
        ble     cr5, L(medium)
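/* rALIGN below is the distance to the next 32-byte boundary; the
   address is already doubleword aligned, so rALIGN is 8, 16, or 24.
   mtcrf puts its bits into CR7: CR bit 28 (the 8s bit) selects one
   catch-up std, and the cr1 compare against 0x10 selects the pair.  */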
/* Align to 32-byte boundary.  */
        andi.   rALIGN, rMEMP, 0x18
        subfic  rALIGN, rALIGN, 0x20
        insrdi  rCHR, rCHR, 32, 0       /* Replicate word to double word.  */
        beq     L(caligned)
        mtcrf   0x01, rALIGN
        add     rMEMP, rMEMP, rALIGN
        sub     rLEN, rLEN, rALIGN
        cmplwi  cr1, rALIGN, 0x10
        mr      rMEMP2, rMEMP
        bf      28, L(a1)
        stdu    rCHR, -8(rMEMP2)
L(a1):  blt     cr1, L(a2)
        std     rCHR, -8(rMEMP2)
        stdu    rCHR, -16(rMEMP2)
L(a2):

/* Now aligned to a 32 byte boundary.  */
        .align 4
L(caligned):
        cmpldi  cr1, rCHR, 0
        clrrdi. rALIGN, rLEN, 5
        mtcrf   0x01, rLEN
        beq     cr1, L(zloopstart)      /* Special case for clearing memory using dcbz.  */
        beq     L(medium)               /* We may not actually get to do a full line.  */
        .align 4
/* Storing a non-zero "c" value.  We are aligned at a sector (32-byte)
   boundary but may not be at a cache line (128-byte) boundary.  */
L(nzloopstart):
/* memset in 32-byte chunks until we get to a cache line boundary.
   If rLEN is less than the distance to the next cache-line boundary use
   cacheAligned1 code to finish the tail.  */
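/* At most three 32-byte sectors separate us from the next 128-byte
   boundary, so the loop is peeled into three straight-line chunks.
   Stores alternate between rMEMP and the rMEMP3 copy so the pointer
   updates and the andi. alignment tests can be scheduled alongside the
   store stream on POWER6's in-order pipeline.  */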
        cmpldi  cr1,rLEN,128

        andi.   rTMP,rMEMP,127
        blt     cr1,L(cacheAligned1)
        addi    rMEMP3,rMEMP,32
        beq     L(nzCacheAligned)
        addi    rLEN,rLEN,-32
        std     rCHR,0(rMEMP)
        std     rCHR,8(rMEMP)
        std     rCHR,16(rMEMP)
        addi    rMEMP,rMEMP,32
        andi.   rTMP,rMEMP3,127
        std     rCHR,-8(rMEMP3)

        beq     L(nzCacheAligned)
        addi    rLEN,rLEN,-32
        std     rCHR,0(rMEMP3)
        addi    rMEMP,rMEMP,32
        std     rCHR,8(rMEMP3)
        andi.   rTMP,rMEMP,127
        std     rCHR,16(rMEMP3)
        std     rCHR,24(rMEMP3)

        beq     L(nzCacheAligned)
        addi    rLEN,rLEN,-32
        std     rCHR,32(rMEMP3)
        addi    rMEMP,rMEMP,32
        cmpldi  cr1,rLEN,128
        std     rCHR,40(rMEMP3)
        cmpldi  cr6,rLEN,256
        li      rMEMP2,128
        std     rCHR,48(rMEMP3)
        std     rCHR,56(rMEMP3)
        blt     cr1,L(cacheAligned1)
        b       L(nzCacheAligned128)

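/* The 128-byte body below is fully unrolled into sixteen std
   instructions issued through two pointers (rMEMP and rMEMP3, 64 bytes
   apart), again overlapping address arithmetic with the store stream.
   The dcbtst on the fall-through path hints the tail's store target to
   the data cache before the remaining bytes are written.  */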
/* Now we are aligned to the cache line and can use dcbtst.  */
        .align 4
L(nzCacheAligned):
        cmpldi  cr1,rLEN,128
        blt     cr1,L(cacheAligned1)
        b       L(nzCacheAligned128)
        .align 5
L(nzCacheAligned128):
        cmpldi  cr1,rLEN,256
        addi    rMEMP3,rMEMP,64
        std     rCHR,0(rMEMP)
        std     rCHR,8(rMEMP)
        std     rCHR,16(rMEMP)
        std     rCHR,24(rMEMP)
        std     rCHR,32(rMEMP)
        std     rCHR,40(rMEMP)
        std     rCHR,48(rMEMP)
        std     rCHR,56(rMEMP)
        addi    rMEMP,rMEMP3,64
        addi    rLEN,rLEN,-128
        std     rCHR,0(rMEMP3)
        std     rCHR,8(rMEMP3)
        std     rCHR,16(rMEMP3)
        std     rCHR,24(rMEMP3)
        std     rCHR,32(rMEMP3)
        std     rCHR,40(rMEMP3)
        std     rCHR,48(rMEMP3)
        std     rCHR,56(rMEMP3)
        bge     cr1,L(nzCacheAligned128)
        dcbtst  0,rMEMP
        b       L(cacheAligned1)
        .align 5
/* Storing a zero "c" value.  We are aligned at a sector (32-byte)
   boundary but may not be at cache line (128-byte) boundary.  If the
   remaining length spans a full cache line we can use the Data cache
   block zero instruction.  */
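/* dcbz establishes a whole 128-byte line of zeros in the data cache
   without first reading it from memory.  Conceptually (illustrative C,
   assuming the POWER6 line size of 128 bytes; not part of the build):

     while (len >= 128)
       {
         __asm__ ("dcbz 0,%0" : : "r" (p) : "memory");
         p += 128;  len -= 128;
       }

   The loops below layer 256-byte and 128-byte variants of this idea on
   top of the 32-byte catch-up code.  */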
L(zloopstart):
/* memset in 32-byte chunks until we get to a cache line boundary.
   If rLEN is less than the distance to the next cache-line boundary use
   cacheAligned1 code to finish the tail.  */
        cmpldi  cr1,rLEN,128
        beq     L(medium)
L(getCacheAligned):
        andi.   rTMP,rMEMP,127
        nop
        blt     cr1,L(cacheAligned1)
        addi    rMEMP3,rMEMP,32
        beq     L(cacheAligned)
        addi    rLEN,rLEN,-32
        std     rCHR,0(rMEMP)
        std     rCHR,8(rMEMP)
        std     rCHR,16(rMEMP)
        addi    rMEMP,rMEMP,32
        andi.   rTMP,rMEMP3,127
        std     rCHR,-8(rMEMP3)
L(getCacheAligned2):
        beq     L(cacheAligned)
        addi    rLEN,rLEN,-32
        std     rCHR,0(rMEMP3)
        std     rCHR,8(rMEMP3)
        addi    rMEMP,rMEMP,32
        andi.   rTMP,rMEMP,127
        std     rCHR,16(rMEMP3)
        std     rCHR,24(rMEMP3)
L(getCacheAligned3):
        beq     L(cacheAligned)
        addi    rLEN,rLEN,-32
        std     rCHR,32(rMEMP3)
        addi    rMEMP,rMEMP,32
        cmpldi  cr1,rLEN,128
        std     rCHR,40(rMEMP3)
        cmpldi  cr6,rLEN,256
        li      rMEMP2,128
        std     rCHR,48(rMEMP3)
        std     rCHR,56(rMEMP3)
        blt     cr1,L(cacheAligned1)
        blt     cr6,L(cacheAligned128)
        b       L(cacheAlignedx)

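/* rMEMP2 holds the constant 128 so that "dcbz rMEMP2,rMEMP" zeros the
   line after the one addressed by rMEMP; the 256-byte paths below can
   therefore clear two cache lines back to back, with the length
   compares for the next dispatch executing between the two dcbz.  */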
/* Now we are aligned to the cache line and can use dcbz.  */
        .align 5
L(cacheAligned):
        cmpldi  cr1,rLEN,128
        cmpldi  cr6,rLEN,256
        blt     cr1,L(cacheAligned1)
        li      rMEMP2,128
L(cacheAlignedx):
        cmpldi  cr5,rLEN,640
        blt     cr6,L(cacheAligned128)
        bgt     cr5,L(cacheAligned512)
        cmpldi  cr6,rLEN,512
        dcbz    0,rMEMP
        cmpldi  cr1,rLEN,384
        dcbz    rMEMP2,rMEMP
        addi    rMEMP,rMEMP,256
        addi    rLEN,rLEN,-256
        blt     cr1,L(cacheAligned1)
        blt     cr6,L(cacheAligned128)
        b       L(cacheAligned256)
        .align 5
/* A simple loop for the longer (>640 bytes) lengths.  This form limits
   the number of mispredicted branches to exactly one, at loop exit.  */
L(cacheAligned512):
        cmpldi  cr1,rLEN,128
        blt     cr1,L(cacheAligned1)
        dcbz    0,rMEMP
        addi    rLEN,rLEN,-128
        addi    rMEMP,rMEMP,128
        b       L(cacheAligned512)
        .align 5
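/* The 256-byte variant: two dcbz per iteration via the rMEMP2=128
   offset, with the 512/384 length compares hoisted between them so the
   loop-carried branch decision is ready by the bottom of the body.  */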
L(cacheAligned256):
        cmpldi  cr6,rLEN,512
        dcbz    0,rMEMP
        cmpldi  cr1,rLEN,384
        dcbz    rMEMP2,rMEMP
        addi    rMEMP,rMEMP,256
        addi    rLEN,rLEN,-256
        bge     cr6,L(cacheAligned256)
        blt     cr1,L(cacheAligned1)
        .align 4
L(cacheAligned128):
        dcbz    0,rMEMP
        addi    rMEMP,rMEMP,128
        addi    rLEN,rLEN,-128
        nop
L(cacheAligned1):
        cmpldi  cr1,rLEN,32
        blt     cr1,L(handletail32)
        addi    rMEMP3,rMEMP,32
        addi    rLEN,rLEN,-32
        std     rCHR,0(rMEMP)
        std     rCHR,8(rMEMP)
        std     rCHR,16(rMEMP)
        addi    rMEMP,rMEMP,32
        cmpldi  cr1,rLEN,32
        std     rCHR,-8(rMEMP3)
L(cacheAligned2):
        blt     cr1,L(handletail32)
        addi    rLEN,rLEN,-32
        std     rCHR,0(rMEMP3)
        std     rCHR,8(rMEMP3)
        addi    rMEMP,rMEMP,32
        cmpldi  cr1,rLEN,32
        std     rCHR,16(rMEMP3)
        std     rCHR,24(rMEMP3)
        nop
L(cacheAligned3):
        blt     cr1,L(handletail32)
        addi    rMEMP,rMEMP,32
        addi    rLEN,rLEN,-32
        std     rCHR,32(rMEMP3)
        std     rCHR,40(rMEMP3)
        std     rCHR,48(rMEMP3)
        std     rCHR,56(rMEMP3)

/* We are here because the length or remainder (rLEN) is less than the
   cache line/sector size and does not justify aggressive loop unrolling.
   So set up the preconditions for L(medium) and go there.  */
        .align 3
L(handletail32):
        cmpldi  cr1,rLEN,0
        beqlr   cr1
        b       L(medium)

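/* The short path stores single bytes only, so it is correct for any
   alignment; one compare against 4 picks between a single group of up
   to four byte stores and two such groups.  */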
        .align 5
L(small):
/* Memset of 8 bytes or less.  */
        cmpldi  cr6, rLEN, 4
        cmpldi  cr5, rLEN, 1
        ble     cr6,L(le4)
        subi    rLEN, rLEN, 4
        stb     rCHR,0(rMEMP)
        stb     rCHR,1(rMEMP)
        stb     rCHR,2(rMEMP)
        stb     rCHR,3(rMEMP)
        addi    rMEMP,rMEMP, 4
        cmpldi  cr5, rLEN, 1
L(le4):
        cmpldi  cr1, rLEN, 3
        bltlr   cr5
        stb     rCHR, 0(rMEMP)
        beqlr   cr5
        stb     rCHR, 1(rMEMP)
        bltlr   cr1
        stb     rCHR, 2(rMEMP)
        beqlr   cr1
        stb     rCHR, 3(rMEMP)
        blr

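/* L(medium) finishes any 0-31 byte remainder.  CR7 still holds the low
   four bits of rLEN from an earlier mtcrf 0x01,rLEN; the loops above
   only subtract multiples of 32, which leaves those bits valid.  Each
   bt/bf then selects one store directly: bit 31 -> 1 byte, bit 30 -> 2,
   bit 29 -> 4, bit 28 -> 8, and cr1 (rLEN >= 16) -> a 16-byte pair.
   rMEMP is advanced to the end of the region first and the update-form
   stores (stbu/sthu/stwu/stdu) walk backward through it.  */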
/* Memset of 0-31 bytes.  */
        .align 5
L(medium):
        insrdi  rCHR, rCHR, 32, 0       /* Replicate word to double word.  */
        cmpldi  cr1, rLEN, 16
L(medium_tail2):
        add     rMEMP, rMEMP, rLEN
L(medium_tail):
        bt-     31, L(medium_31t)
        bt-     30, L(medium_30t)
L(medium_30f):
        bt      29, L(medium_29t)
L(medium_29f):
        bge     cr1, L(medium_27t)
        bflr    28
        std     rCHR, -8(rMEMP)
        blr

L(medium_31t):
        stbu    rCHR, -1(rMEMP)
        bf-     30, L(medium_30f)
L(medium_30t):
        sthu    rCHR, -2(rMEMP)
        bf-     29, L(medium_29f)
L(medium_29t):
        stwu    rCHR, -4(rMEMP)
        blt     cr1, L(medium_27f)
L(medium_27t):
        std     rCHR, -8(rMEMP)
        stdu    rCHR, -16(rMEMP)
L(medium_27f):
        bflr    28
L(medium_28t):
        std     rCHR, -8(rMEMP)
        blr
END_GEN_TB (memset,TB_TOCLESS)
libc_hidden_builtin_def (memset)

#ifndef NO_BZERO_IMPL
/* Copied from bzero.S to prevent the linker from inserting a stub
   between bzero and memset.  */
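/* In C terms this is simply (illustrative only):

     void __bzero (void *s, size_t n) { memset (s, 0, n); }

   implemented by moving the length from r4 to r5, loading a zero fill
   byte into r4, and falling into memset's local entry point.  */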
ENTRY (__bzero)
        CALL_MCOUNT 3
        mr      r5,r4
        li      r4,0
        b       L(_memset)
END_GEN_TB (__bzero,TB_TOCLESS)

weak_alias (__bzero, bzero)
#endif