]> git.ipfire.org Git - thirdparty/glibc.git/blame - sysdeps/powerpc/powerpc64/power6/memset.S
Update copyright dates with scripts/update-copyrights.
[thirdparty/glibc.git] / sysdeps / powerpc / powerpc64 / power6 / memset.S
CommitLineData
a88f47a7 1/* Optimized 64-bit memset implementation for POWER6.
04277e02 2 Copyright (C) 1997-2019 Free Software Foundation, Inc.
04067002
UD
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
59ba27a6
PE
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
04067002
UD
18
19#include <sysdep.h>
04067002 20
f17a4233 21/* void * [r3] memset (void *s [r3], int c [r4], size_t n [r5]);
04067002
UD
22 Returns 's'.
23
24 The memset is done in three sizes: byte (8 bits), word (32 bits),
25 sector (256 bits). There is a special case for setting whole 128-byte
26 cache lines to 0, to take advantage of the dcbz instruction. */
27
18e0054b
WSM
28#ifndef MEMSET
29# define MEMSET memset
30#endif
a88f47a7 31 .machine power6
d5b41185 32ENTRY_TOCLESS (MEMSET, 7)
04067002
UD
33 CALL_MCOUNT 3
34
/* Register aliases used by the whole routine. rRTN and rMEMP0 both name
   r3; the code below only reads r3, so it still holds 's' on return. */
35#define rTMP r0 /* Scratch; only written by andi. to set cr0. */
36#define rRTN r3 /* Initial value of 1st argument. */
2d67d91a
JM
37#define rMEMP0 r3 /* Original value of 1st arg. */
38#define rCHR r4 /* Char to set in each byte. */
39#define rLEN r5 /* Length of region to set. */
40#define rMEMP r6 /* Address at which we are storing. */
04067002
UD
41#define rALIGN r7 /* Number of bytes we are setting now (when aligning). */
42#define rMEMP2 r8 /* 2nd store pointer while aligning; later the dcbz index (128). */
a88f47a7 43#define rMEMP3 r9 /* Alt mem pointer. */
04067002 44L(_memset):
04067002
UD
45/* Take care of case for size <= 8. Also entered from __bzero with
   rCHR = 0 and the length already moved into rLEN. */
46 cmpldi cr1, rLEN, 8
47 andi. rALIGN, rMEMP0, 7 /* rALIGN = s & 7; cr0.eq if s is doubleword aligned. */
48 mr rMEMP, rMEMP0 /* Work on a copy; r3 is left untouched. */
a88f47a7 49 ble cr1, L(small)
04067002
UD
50
51/* Align to doubleword boundary. */
/* cr0 still holds the result of the preceding "andi. rALIGN, rMEMP0, 7".
   cr5 records the original length against 31 for the L(medium) test made
   at L(aligned). */
52 cmpldi cr5, rLEN, 31
3be87c77 53 insrdi rCHR, rCHR, 8, 48 /* Replicate byte to halfword. */
04067002
UD
54 beq+ L(aligned2) /* Already doubleword aligned. */
55 mtcrf 0x01, rMEMP0 /* cr7 = low 4 address bits: 28=8s, 29=4s, 30=2s, 31=1s. */
56 subfic rALIGN, rALIGN, 8 /* Bytes up to the next doubleword boundary. */
57 cror 28,30,31 /* Detect odd word aligned. */
                /* Bit 28 is now (bit 30 | bit 31), i.e. start is not word aligned. */
58 add rMEMP, rMEMP, rALIGN /* rMEMP = aligned address; stores below use negative offsets. */
59 sub rLEN, rLEN, rALIGN
3be87c77 60 insrdi rCHR, rCHR, 16, 32 /* Replicate halfword to word. */
04067002
UD
61 bt 29, L(g4) /* Address bit 4 set: start lies in the odd word. */
62/* Process the even word of doubleword. */
63 bf+ 31, L(g2) /* Even start address: no single byte needed. */
64 stb rCHR, 0(rMEMP0)
65 bt 30, L(g4x) /* Start was at offset 3: only the odd word remains. */
66L(g2):
67 sth rCHR, -6(rMEMP) /* Bytes 2-3 of the doubleword. */
68L(g4x):
69 stw rCHR, -4(rMEMP) /* Bytes 4-7 of the doubleword. */
70 b L(aligned)
71/* Process the odd word of doubleword. */
72L(g4):
73 bf 28, L(g4x) /* If false, word aligned on odd word. */
74 bf+ 31, L(g0) /* Even start address: no single byte needed. */
75 stb rCHR, 0(rMEMP0)
76 bt 30, L(aligned) /* Start was at offset 7: that byte was all. */
77L(g0):
78 sth rCHR, -2(rMEMP) /* Bytes 6-7; then falls through into L(aligned2). */
79
80/* Handle the case of size <= 31. */
81L(aligned2):
3be87c77 82 insrdi rCHR, rCHR, 16, 32 /* Replicate halfword to word. */
04067002
UD
83L(aligned):
84 mtcrf 0x01, rLEN /* cr7 = low 4 bits of the remaining length, tested by L(medium). */
85 ble cr5, L(medium) /* cr5 was set from the length before doubleword alignment. */
86/* Align to 32-byte boundary. */
87 andi. rALIGN, rMEMP, 0x18 /* cr0.eq if rMEMP is already 32-byte aligned. */
88 subfic rALIGN, rALIGN, 0x20 /* Bytes up to the next 32-byte boundary. */
3be87c77 89 insrdi rCHR, rCHR, 32, 0 /* Replicate word to double word. */
04067002
UD
90 beq L(caligned) /* Tests cr0 from the andi. above. */
91 mtcrf 0x01, rALIGN /* cr7 bit 28 = rALIGN & 8; cr7 is reloaded from rLEN at L(caligned). */
92 add rMEMP, rMEMP, rALIGN
93 sub rLEN, rLEN, rALIGN
94 cmplwi cr1, rALIGN, 0x10
95 mr rMEMP2, rMEMP /* rMEMP2 walks backwards from the aligned address. */
96 bf 28, L(a1)
97 stdu rCHR, -8(rMEMP2) /* One doubleword when rALIGN has its 8s bit set. */
98L(a1): blt cr1, L(a2)
99 std rCHR, -8(rMEMP2) /* Two more doublewords when rALIGN >= 16. */
100 stdu rCHR, -16(rMEMP2)
101L(a2):
102
103/* Now aligned to a 32 byte boundary. */
104 .align 4
105L(caligned):
106 cmpldi cr1, rCHR, 0 /* rCHR is fully replicated here, so this tests c == 0. */
107 clrrdi. rALIGN, rLEN, 5 /* rLEN rounded down to a multiple of 32; cr0.eq if rLEN < 32. */
108 mtcrf 0x01, rLEN /* cr7 = low 4 bits of rLEN for the final L(medium) tail. */
109 beq cr1, L(zloopstart) /* Special case for clearing memory using dcbz. */
04067002 110 beq L(medium) /* We may not actually get to do a full line. */
a88f47a7
UD
111 .align 4
112/* Storing a non-zero "c" value. We are aligned at a sector (32-byte)
113 boundary but may not be at cache line (128-byte) boundary. */
114L(nzloopstart):
115/* memset in 32-byte chunks until we get to a cache line boundary.
f24a6d08 116 If rLEN is less than the distance to the next cache-line boundary use
a88f47a7
UD
117 cacheAligned1 code to finish the tail. */
/* Note: the test actually made below is simply rLEN < 128. */
118 cmpldi cr1,rLEN,128
04067002 119
a88f47a7
UD
120 andi. rTMP,rMEMP,127 /* cr0.eq if rMEMP is on a 128-byte boundary. */
121 blt cr1,L(cacheAligned1)
122 addi rMEMP3,rMEMP,32
123 beq L(nzCacheAligned)
/* First 32-byte sector. The alignment test for the next sector is
   interleaved with the stores. */
124 addi rLEN,rLEN,-32
125 std rCHR,0(rMEMP)
126 std rCHR,8(rMEMP)
127 std rCHR,16(rMEMP)
128 addi rMEMP,rMEMP,32
129 andi. rTMP,rMEMP3,127
130 std rCHR,-8(rMEMP3) /* Fourth doubleword of the first sector. */
04067002 131
a88f47a7
UD
132 beq L(nzCacheAligned)
/* Second sector, addressed through rMEMP3. */
133 addi rLEN,rLEN,-32
134 std rCHR,0(rMEMP3)
135 addi rMEMP,rMEMP,32
136 std rCHR,8(rMEMP3)
137 andi. rTMP,rMEMP,127
138 std rCHR,16(rMEMP3)
139 std rCHR,24(rMEMP3)
140
141 beq L(nzCacheAligned)
/* Third sector. Starting 32-byte aligned, at most three sectors can
   precede a 128-byte boundary, so no further alignment test follows. */
142 addi rLEN,rLEN,-32
143 std rCHR,32(rMEMP3)
144 addi rMEMP,rMEMP,32
145 cmpldi cr1,rLEN,128
146 std rCHR,40(rMEMP3)
147 cmpldi cr6,rLEN,256 /* cr6 is not read by L(nzCacheAligned128) as shown in this file. */
148 li rMEMP2,128 /* Likewise rMEMP2 is not used on the non-zero path shown here. */
149 std rCHR,48(rMEMP3)
150 std rCHR,56(rMEMP3)
151 blt cr1,L(cacheAligned1)
152 b L(nzCacheAligned128)
153
154/* Now we are aligned to the cache line and can use dcbtst. */
155 .align 4
156L(nzCacheAligned):
157 cmpldi cr1,rLEN,128
158 blt cr1,L(cacheAligned1)
159 b L(nzCacheAligned128)
160 .align 5
/* Main non-zero loop: 128 bytes (16 doublewords) per iteration, half via
   rMEMP and half via rMEMP3 = rMEMP + 64. */
161L(nzCacheAligned128):
162 cmpldi cr1,rLEN,256 /* Compared before the decrement: >= 256 means another full pass. */
163 addi rMEMP3,rMEMP,64
164 std rCHR,0(rMEMP)
165 std rCHR,8(rMEMP)
166 std rCHR,16(rMEMP)
167 std rCHR,24(rMEMP)
168 std rCHR,32(rMEMP)
169 std rCHR,40(rMEMP)
170 std rCHR,48(rMEMP)
171 std rCHR,56(rMEMP)
172 addi rMEMP,rMEMP3,64 /* rMEMP advances by 128 in total. */
173 addi rLEN,rLEN,-128
174 std rCHR,0(rMEMP3)
175 std rCHR,8(rMEMP3)
176 std rCHR,16(rMEMP3)
177 std rCHR,24(rMEMP3)
178 std rCHR,32(rMEMP3)
179 std rCHR,40(rMEMP3)
180 std rCHR,48(rMEMP3)
181 std rCHR,56(rMEMP3)
182 bge cr1,L(nzCacheAligned128)
183 dcbtst 0,rMEMP /* Touch-for-store hint on the block holding the tail. */
184 b L(cacheAligned1)
04067002 185 .align 5
a88f47a7
UD
186/* Storing a zero "c" value. We are aligned at a sector (32-byte)
187 boundary but may not be at cache line (128-byte) boundary. If the
188 remaining length spans a full cache line we can use the Data cache
189 block zero instruction. */
04067002 190L(zloopstart):
a88f47a7 191/* memset in 32-byte chunks until we get to a cache line boundary.
f24a6d08 192 If rLEN is less than the distance to the next cache-line boundary use
a88f47a7
UD
193 cacheAligned1 code to finish the tail. */
194 cmpldi cr1,rLEN,128
04067002 195 beq L(medium) /* cr0 is still from clrrdi. at L(caligned): rLEN < 32. */
04067002 196L(getCacheAligned):
04067002 197 andi. rTMP,rMEMP,127 /* cr0.eq if rMEMP is on a 128-byte boundary. */
a88f47a7
UD
198 nop
199 blt cr1,L(cacheAligned1)
200 addi rMEMP3,rMEMP,32
201 beq L(cacheAligned)
/* First 32-byte sector. rCHR is zero on this path, so these std store
   zeros. */
202 addi rLEN,rLEN,-32
203 std rCHR,0(rMEMP)
204 std rCHR,8(rMEMP)
205 std rCHR,16(rMEMP)
04067002 206 addi rMEMP,rMEMP,32
a88f47a7
UD
207 andi. rTMP,rMEMP3,127
208 std rCHR,-8(rMEMP3) /* Fourth doubleword of the first sector. */
209L(getCacheAligned2):
210 beq L(cacheAligned)
04067002 211 addi rLEN,rLEN,-32
a88f47a7
UD
212 std rCHR,0(rMEMP3)
213 std rCHR,8(rMEMP3)
214 addi rMEMP,rMEMP,32
215 andi. rTMP,rMEMP,127
216 std rCHR,16(rMEMP3)
217 std rCHR,24(rMEMP3)
218L(getCacheAligned3):
219 beq L(cacheAligned)
/* Third sector: after it rMEMP must be 128-byte aligned, so dispatch
   directly on the remaining length. */
220 addi rLEN,rLEN,-32
221 std rCHR,32(rMEMP3)
222 addi rMEMP,rMEMP,32
223 cmpldi cr1,rLEN,128
224 std rCHR,40(rMEMP3)
225 cmpldi cr6,rLEN,256
226 li rMEMP2,128 /* dcbz index for the second line of a 256-byte pair. */
227 std rCHR,48(rMEMP3)
228 std rCHR,56(rMEMP3)
229 blt cr1,L(cacheAligned1) /* < 128 left: no whole cache line. */
230 blt cr6,L(cacheAligned128) /* 128..255 left: exactly one dcbz. */
231 b L(cacheAlignedx) /* >= 256 left; cr6 and rMEMP2 are already set up. */
04067002
UD
232
233/* Now we are aligned to the cache line and can use dcbz. */
a88f47a7 234 .align 5
04067002 235L(cacheAligned):
a88f47a7
UD
236 cmpldi cr1,rLEN,128
237 cmpldi cr6,rLEN,256
238 blt cr1,L(cacheAligned1)
239 li rMEMP2,128 /* dcbz index for the second line of a 256-byte pair. */
/* Entered with cr6 = (rLEN vs 256) and rMEMP2 = 128. */
240L(cacheAlignedx):
241 cmpldi cr5,rLEN,640
242 blt cr6,L(cacheAligned128) /* 128..255 bytes: a single dcbz. */
243 bgt cr5,L(cacheAligned512) /* > 640 bytes: simple one-line-per-pass loop. */
/* 256..640 bytes: clear two lines now; the compares use the length
   before the 256-byte decrement. */
244 cmpldi cr6,rLEN,512
245 dcbz 0,rMEMP
246 cmpldi cr1,rLEN,384
247 dcbz rMEMP2,rMEMP
248 addi rMEMP,rMEMP,256
249 addi rLEN,rLEN,-256
250 blt cr1,L(cacheAligned1) /* Was < 384: fewer than 128 bytes remain. */
251 blt cr6,L(cacheAligned128) /* Was 384..511: 128..255 bytes remain. */
252 b L(cacheAligned256) /* Was 512..640: at least 256 bytes remain. */
253 .align 5
254/* A simple loop for the longer (>640 bytes) lengths. This form limits
255 branch mispredictions to exactly 1, at loop exit. */
256L(cacheAligned512):
78b7adba 257 cmpldi cr1,rLEN,128
a88f47a7
UD
258 blt cr1,L(cacheAligned1) /* Exit once less than a full cache line remains. */
259 dcbz 0,rMEMP /* Zero one 128-byte line. */
260 addi rLEN,rLEN,-128
261 addi rMEMP,rMEMP,128
262 b L(cacheAligned512)
263 .align 5
/* Zero two cache lines (256 bytes) per pass. rMEMP2 is expected to hold
   128, loaded with li on the paths into this code shown in this file.
   Both compares use the length before the 256-byte decrement. */
264L(cacheAligned256):
265
266 cmpldi cr6,rLEN,512
267
04067002 268 dcbz 0,rMEMP
a88f47a7
UD
269 cmpldi cr1,rLEN,384
270 dcbz rMEMP2,rMEMP
271 addi rMEMP,rMEMP,256
272 addi rLEN,rLEN,-256
273
274 bge cr6,L(cacheAligned256) /* Was >= 512: at least 256 bytes still remain. */
275
276 blt cr1,L(cacheAligned1) /* Was < 384: fewer than 128 bytes remain. */
277 .align 4
/* One more whole cache line, then fall through to the sector tail. */
278L(cacheAligned128):
279 dcbz 0,rMEMP
280 addi rMEMP,rMEMP,128
281 addi rLEN,rLEN,-128
282 nop
/* Store up to three remaining 32-byte sectors with std. The branches
   into here shown in this file all arrive with rLEN < 128, which is why
   no length test follows the third sector. */
283L(cacheAligned1):
284 cmpldi cr1,rLEN,32
285 blt cr1,L(handletail32)
286 addi rMEMP3,rMEMP,32
287 addi rLEN,rLEN,-32
288 std rCHR,0(rMEMP)
289 std rCHR,8(rMEMP)
290 std rCHR,16(rMEMP)
291 addi rMEMP,rMEMP,32
292 cmpldi cr1,rLEN,32
293 std rCHR,-8(rMEMP3) /* Fourth doubleword of the first sector. */
294L(cacheAligned2):
295 blt cr1,L(handletail32)
296 addi rLEN,rLEN,-32
297 std rCHR,0(rMEMP3)
298 std rCHR,8(rMEMP3)
299 addi rMEMP,rMEMP,32
300 cmpldi cr1,rLEN,32
301 std rCHR,16(rMEMP3)
302 std rCHR,24(rMEMP3)
303 nop
304L(cacheAligned3):
305 blt cr1,L(handletail32)
306 addi rMEMP,rMEMP,32
307 addi rLEN,rLEN,-32
308 std rCHR,32(rMEMP3)
309 std rCHR,40(rMEMP3)
310 std rCHR,48(rMEMP3)
311 std rCHR,56(rMEMP3) /* Falls through into L(handletail32). */
04067002 312
a88f47a7
UD
313/* We are here because the length or remainder (rLEN) is less than the
314 cache line/sector size and does not justify aggressive loop unrolling.
315 So set up the preconditions for L(medium) and go there. */
/* L(medium) tests cr7 as the low four bits of rLEN. On the paths shown
   in this file cr7 was loaded from rLEN at L(caligned) and rLEN has since
   only been reduced by multiples of 32, so those bits are still valid. */
04067002
UD
316 .align 3
317L(handletail32):
a88f47a7
UD
318 cmpldi cr1,rLEN,0
319 beqlr cr1 /* Nothing left: return with r3 still holding 's'. */
320 b L(medium)
04067002
UD
321
322 .align 5
323L(small):
324/* Memset of 8 bytes or less. */
/* Only byte stores are used, so rCHR needs no replication and no
   alignment is required. */
325 cmpldi cr6, rLEN, 4
326 cmpldi cr5, rLEN, 1
327 ble cr6,L(le4)
/* 5..8 bytes: store the first four, then handle the remaining 1..4. */
328 subi rLEN, rLEN, 4
329 stb rCHR,0(rMEMP)
330 stb rCHR,1(rMEMP)
331 stb rCHR,2(rMEMP)
332 stb rCHR,3(rMEMP)
333 addi rMEMP,rMEMP, 4
334 cmpldi cr5, rLEN, 1 /* Redo the compare for the reduced length. */
/* 0..4 bytes: cr5 = (rLEN vs 1), cr1 = (rLEN vs 3). */
335L(le4):
336 cmpldi cr1, rLEN, 3
337 bltlr cr5 /* rLEN == 0. */
338 stb rCHR, 0(rMEMP)
339 beqlr cr5 /* rLEN == 1. */
340 stb rCHR, 1(rMEMP)
341 bltlr cr1 /* rLEN == 2. */
342 stb rCHR, 2(rMEMP)
343 beqlr cr1 /* rLEN == 3. */
344 stb rCHR, 3(rMEMP)
345 blr
346
347/* Memset of 0-31 bytes. */
/* Expects cr7 to hold the low four bits of rLEN (bit 31 = 1s, 30 = 2s,
   29 = 4s, 28 = 8s) and rCHR to be replicated at least to a word. The
   region is filled backwards from its end using update-form stores. */
348 .align 5
349L(medium):
3be87c77 350 insrdi rCHR, rCHR, 32, 0 /* Replicate word to double word. */
04067002
UD
351 cmpldi cr1, rLEN, 16 /* cr1 stands in for the 16s bit of rLEN. */
352L(medium_tail2):
353 add rMEMP, rMEMP, rLEN /* rMEMP = one past the last byte to set. */
354L(medium_tail):
355 bt- 31, L(medium_31t)
356 bt- 30, L(medium_30t)
357L(medium_30f):
a88f47a7 358 bt 29, L(medium_29t)
04067002 359L(medium_29f):
a88f47a7
UD
360 bge cr1, L(medium_27t)
361 bflr 28 /* No 8-byte piece: done. */
04067002
UD
362 std rCHR, -8(rMEMP)
363 blr
364
365L(medium_31t):
366 stbu rCHR, -1(rMEMP) /* Odd length: one trailing byte. */
367 bf- 30, L(medium_30f)
368L(medium_30t):
369 sthu rCHR, -2(rMEMP) /* 2s bit set: one halfword. */
370 bf- 29, L(medium_29f)
371L(medium_29t):
372 stwu rCHR, -4(rMEMP) /* 4s bit set: one word. */
a88f47a7 373 blt cr1, L(medium_27f)
04067002
UD
374L(medium_27t):
375 std rCHR, -8(rMEMP) /* rLEN >= 16: two doublewords. */
376 stdu rCHR, -16(rMEMP)
377L(medium_27f):
a88f47a7 378 bflr 28 /* No 8-byte piece: done. */
04067002
UD
379L(medium_28t):
380 std rCHR, -8(rMEMP) /* 8s bit set: one final doubleword. */
381 blr
18e0054b 382END_GEN_TB (MEMSET,TB_TOCLESS)
04067002
UD
383libc_hidden_builtin_def (memset)
384
385/* Copied from bzero.S to prevent the linker from inserting a stub
386 between bzero and memset. */
/* Rearranges the arguments into memset's register layout with a fill
   value of 0, then continues in the memset body at L(_memset). */
d5b41185 387ENTRY_TOCLESS (__bzero)
04067002 388 CALL_MCOUNT 3
04067002
UD
389 mr r5,r4 /* Length moves from the 2nd to the 3rd argument register. */
390 li r4,0 /* Fill value is zero. */
391 b L(_memset) /* r3 is passed through unchanged. */
3b473fec
AZ
392END (__bzero)
393#ifndef __bzero
2d67d91a 394weak_alias (__bzero, bzero)
8a29a3d0 395#endif