]> git.ipfire.org Git - thirdparty/glibc.git/blame - sysdeps/powerpc/powerpc64/power7/memset.S
Remove "Contributed by" lines
[thirdparty/glibc.git] / sysdeps / powerpc / powerpc64 / power7 / memset.S
CommitLineData
33b8d90a 1/* Optimized memset implementation for PowerPC64/POWER7.
2b778ceb 2 Copyright (C) 2010-2021 Free Software Foundation, Inc.
33b8d90a
LM
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
59ba27a6 16 License along with the GNU C Library; if not, see
5a82c748 17 <https://www.gnu.org/licenses/>. */
33b8d90a
LM
18
19#include <sysdep.h>
33b8d90a 20
f17a4233 21/* void * [r3] memset (void *s [r3], int c [r4], size_t n [r5]);
33b8d90a
LM
22 Returns 's'. */
23
18e0054b
WSM
24#ifndef MEMSET
25# define MEMSET memset
26#endif
33b8d90a 27 .machine power7
d5b41185 28ENTRY_TOCLESS (MEMSET, 5)
33b8d90a
LM
29 CALL_MCOUNT 3
30
31L(_memset): /* Also branched to by __bzero after it sets up r4/r5. */
32 cmpldi cr7,5,31 /* cr7: length vs 31 (selects the medium path). */
33 cmpldi cr6,5,8 /* cr6: length vs 8 (selects the small path). */
34 mr 10,3 /* r10 = working DST pointer; r3 is never written, so it stays the return value. */
35
36 /* Replicate byte to word. */
3be87c77
AM
37 insrdi 4,4,8,48
38 insrdi 4,4,16,32
33b8d90a
LM
39 ble cr6,L(small) /* If length <= 8, use short copy code. */
40
41 neg 0,3 /* r0 = -DST; its low bits give the bytes needed to reach alignment. */
42 ble cr7,L(medium) /* If length < 32, use medium copy code. */
43
44 andi. 11,10,7 /* Check alignment of DST (cr0.eq set if already 8-byte aligned). */
45 insrdi 4,4,32,0 /* Replicate word to double word. */
46
47 mr 12,5 /* r12 = original length; L(huge) uses it to recompute the remainder. */
48 beq L(big_aligned)
49
50 clrldi 0,0,61 /* r0 = (-DST) & 7 = bytes until 8-byte alignment. */
51 mtocrf 0x01,0 /* Low 4 bits of r0 -> cr7; CR bits 31/30/29 flag 1/2/4 bytes. */
52 subf 5,0,5 /* length -= alignment bytes. */
53
54 /* Get DST aligned to 8 bytes. */
551: bf 31,2f
56
57 stb 4,0(10)
58 addi 10,10,1
592: bf 30,4f
60
61 sth 4,0(10)
62 addi 10,10,2
634: bf 29,L(big_aligned)
64
65 stw 4,0(10)
66 addi 10,10,4
67
68 .align 4
69L(big_aligned): /* r10 is 8-byte aligned here; r5 = bytes still to store. */
70
71 cmpldi cr5,5,255 /* cr5: length vs 255. */
72 li 0,32 /* NOTE(review): r0 appears to be overwritten before any read on the paths in this file - confirm whether this is still needed. */
73 dcbtst 0,10
74 cmpldi cr6,4,0 /* cr6: is the replicated fill value zero? */
75 srdi 9,5,3 /* Number of full doublewords remaining. */
76 crand 27,26,21 /* CR bit 27 = cr6.eq (value == 0) AND cr5.gt (length > 255). */
77 mtocrf 0x01,9 /* Low 4 bits of the doubleword count -> cr7. */
78 bt 27,L(huge)
79
80 /* From this point on, the original length was at least 32 bytes and either
81 the value isn't 0 or the length is <= 255 (so the dcbz path is not used). */
82
83 srdi 8,5,5 /* r8 = number of 32-byte blocks. */
84 clrldi 11,5,61 /* r11 = length & 7 (tail bytes). */
85 cmpldi cr6,11,0 /* cr6.eq set when there are no tail bytes. */
86 cmpldi cr1,9,4 /* cr1: fewer than 4 doublewords in total? */
87 mtctr 8 /* CTR = 32-byte block count for L(big_loop). */
88
89 /* Copy 1~3 doublewords so the main loop starts
90 at a multiple of 32 bytes. */
91
92 bf 30,1f
93
94 std 4,0(10)
95 std 4,8(10)
96 addi 10,10,16
97 bf 31,L(big_loop)
98
99 std 4,0(10)
100 addi 10,10,8
101 mr 12,10 /* L(tail_bytes) stores through r12. */
102 blt cr1,L(tail_bytes) /* Fewer than 4 doublewords: there is no 32-byte block to store. */
103 b L(big_loop)
104
105 .align 4
1061: /* Copy 1 doubleword. */
107 bf 31,L(big_loop)
108
109 std 4,0(10)
110 addi 10,10,8
111
112 /* Main aligned copy loop. Copies 32-bytes at a time and
113 ping-pong through r10 and r12 to avoid AGEN delays. CTR holds the number
 of 32-byte blocks; on exit r12 points just past the last stored block. */
114 .align 4
115L(big_loop):
116 addi 12,10,32 /* r12 = address of the block after the one at r10. */
117 std 4,0(10)
118 std 4,8(10)
119 std 4,16(10)
120 std 4,24(10)
121 bdz L(tail_bytes) /* CTR exhausted: r12 already points past the stored data. */
122
123 addi 10,10,64 /* r10 = address of the block after the one at r12. */
124 std 4,0(12)
125 std 4,8(12)
126 std 4,16(12)
127 std 4,24(12)
128 bdnz L(big_loop)
129
130 mr 12,10 /* L(tail_bytes) stores through r12. */
131 b L(tail_bytes)
132
133 .align 4
134L(tail_bytes): /* In: r12 = next address to store, r5 = length, cr6 = (length & 7) vs 0. */
135
136 /* Check for tail bytes. */
137 beqlr cr6 /* No bytes left over after the doublewords: return. */
138
139 clrldi 0,5,61 /* r0 = length & 7. */
140 mtocrf 0x01,0 /* CR bits 29/30/31 <- the 4/2/1 bits of the tail count. */
141
142 /* At this point we have a tail of 1-7 bytes and we know that the
143 destination is doubleword-aligned. */
1444: /* Copy 4 bytes. */
145 bf 29,2f
146
147 stw 4,0(12)
148 addi 12,12,4
1492: /* Copy 2 bytes. */
150 bf 30,1f
151
152 sth 4,0(12)
153 addi 12,12,2
1541: /* Copy 1 byte. */
155 bflr 31
156
157 stb 4,0(12)
158 blr
159
160 /* Special case when value is 0 and we have a long length to deal
161 with. Use dcbz to zero out 128-bytes at a time. Before using
162 dcbz though, we need to get the destination 128-bytes aligned.
 Reached from L(big_aligned) only when value == 0 and length > 255. */
163 .align 4
164L(huge):
165 andi. 11,10,127 /* cr0.eq set when DST is already 128-byte aligned. */
166 neg 0,10
167 beq L(huge_aligned)
168
169 clrldi 0,0,57 /* r0 = (-DST) & 127 = bytes to the next 128-byte boundary. */
170 subf 5,0,5 /* length -= those bytes. */
171 srdi 0,0,3 /* Convert to a doubleword count (DST is 8-byte aligned here). */
172 mtocrf 0x01,0 /* CR bits 28/29/30/31 <- 8/4/2/1 doubleword flags. */
173
174 /* Get DST aligned to 128 bytes. */
1758: bf 28,4f
176
177 std 4,0(10)
178 std 4,8(10)
179 std 4,16(10)
180 std 4,24(10)
181 std 4,32(10)
182 std 4,40(10)
183 std 4,48(10)
184 std 4,56(10)
185 addi 10,10,64
186 .align 4
1874: bf 29,2f
188
189 std 4,0(10)
190 std 4,8(10)
191 std 4,16(10)
192 std 4,24(10)
193 addi 10,10,32
194 .align 4
1952: bf 30,1f
196
197 std 4,0(10)
198 std 4,8(10)
199 addi 10,10,16
200 .align 4
2011: bf 31,L(huge_aligned)
202
203 std 4,0(10)
204 addi 10,10,8
205
206
207L(huge_aligned): /* r10 is 128-byte aligned; r5 = bytes still to zero. */
208 srdi 8,5,7 /* r8 = number of 128-byte blocks. */
209 clrldi 11,5,57 /* r11 = length & 127. */
210 cmpldi cr6,11,0 /* cr6.eq set when nothing remains after the dcbz loop. */
211 mtctr 8
212
213 .align 4
214L(huge_loop):
215 dcbz 0,10 /* Zero the cache block at r10; the loop advances 128 bytes per dcbz. */
216 addi 10,10,128
217 bdnz L(huge_loop)
218
219 /* Check how many bytes are still left. */
220 beqlr cr6
221
222 subf 9,3,10 /* r9 = bytes written so far (current pointer - start). */
223 subf 5,9,12 /* r5 = original length (saved in r12) - bytes written. */
224 srdi 8,5,3 /* r8 = remaining full doublewords. */
225 cmpldi cr6,8,0
226 mtocrf 0x01,8 /* CR bits 28/29/30/31 <- 8/4/2/1 doubleword flags. */
227
228 /* We have a tail of 1~127 bytes. Copy up to 15 doublewords for
229 speed. We'll handle the resulting tail bytes later. */
230 beq cr6,L(tail)
231
2328: bf 28,4f
233
234 std 4,0(10)
235 std 4,8(10)
236 std 4,16(10)
237 std 4,24(10)
238 std 4,32(10)
239 std 4,40(10)
240 std 4,48(10)
241 std 4,56(10)
242 addi 10,10,64
243 .align 4
2444: bf 29,2f
245
246 std 4,0(10)
247 std 4,8(10)
248 std 4,16(10)
249 std 4,24(10)
250 addi 10,10,32
251 .align 4
2522: bf 30,1f
253
254 std 4,0(10)
255 std 4,8(10)
256 addi 10,10,16
257 .align 4
2581: bf 31,L(tail)
259
260 std 4,0(10)
261 addi 10,10,8
262
263 /* Handle the rest of the tail bytes here. In: r10 = next address, r5 = remaining length; only its low 3 bits are used. */
264L(tail):
265 mtocrf 0x01,5 /* CR bits 29/30/31 <- the 4/2/1 bits of the remaining length. */
266
267 .align 4
2684: bf 29,2f
269
270 stw 4,0(10)
271 addi 10,10,4
272 .align 4
2732: bf 30,1f
274
275 sth 4,0(10)
276 addi 10,10,2
277 .align 4
2781: bflr 31
279
280 stb 4,0(10)
281 blr
282
283 /* Expanded tree to copy tail bytes without increments. Entered from L(small) with CR bits 29/30/31 holding the 4/2/1 bits of the length; every store uses a fixed offset from r10. */
284 .align 4
285L(copy_tail):
286 bf 29,L(FXX)
287
288 stw 4,0(10)
289 bf 30,L(TFX)
290
291 sth 4,4(10)
292 bflr 31
293
294 stb 4,6(10)
295 blr
296
297 .align 4
298L(FXX): bf 30,L(FFX) /* 4-bit clear: length is 0-3. */
299
300 sth 4,0(10)
301 bflr 31
302
303 stb 4,2(10)
304 blr
305
306 .align 4
307L(TFX): bflr 31 /* 4-bit set, 2-bit clear: length is 4 or 5. */
308
309 stb 4,4(10)
310 blr
311
312 .align 4
313L(FFX): bflr 31 /* 4-bit and 2-bit clear: length is 0 or 1. */
314
315 stb 4,0(10)
316 blr
317
318 /* Handle copies of 9~31 bytes. In: r10 = DST, r0 = -DST (from the entry code), r4 = value replicated to a word. */
319 .align 4
320L(medium):
321 /* At least 9 bytes to go. */
322 andi. 11,10,3 /* cr0.eq set when DST is already 4-byte aligned. */
323 clrldi 0,0,62 /* r0 = (-DST) & 3 = bytes until 4-byte alignment. */
324 beq L(medium_aligned)
325
3be87c77 326 /* Force 4-bytes alignment for DST. */
33b8d90a
LM
327 mtocrf 0x01,0 /* CR bits 31/30 flag a 1-byte / 2-byte alignment store. */
328 subf 5,0,5 /* length -= alignment bytes. */
3291: /* Copy 1 byte. */
330 bf 31,2f
331
332 stb 4,0(10)
333 addi 10,10,1
3342: /* Copy 2 bytes. */
335 bf 30,L(medium_aligned)
336
337 sth 4,0(10)
338 addi 10,10,2
339
340 .align 4
341L(medium_aligned):
342 /* At least 6 bytes to go, and DST is word-aligned. */
343 cmpldi cr1,5,16 /* cr1: fewer than 16 bytes left? */
344 mtocrf 0x01,5 /* CR bits 28/29/30/31 <- the 8/4/2/1 bits of the remaining length. */
345 blt cr1,8f
346
347 /* Copy 16 bytes. */
348 stw 4,0(10)
349 stw 4,4(10)
350 stw 4,8(10)
351 stw 4,12(10)
352 addi 10,10,16
3538: /* Copy 8 bytes. */
354 bf 28,4f
355
356 stw 4,0(10)
357 stw 4,4(10)
358 addi 10,10,8
3594: /* Copy 4 bytes. */
360 bf 29,2f
361
362 stw 4,0(10)
363 addi 10,10,4
3642: /* Copy 2-3 bytes. */
365 bf 30,1f
366
367 sth 4,0(10)
368 addi 10,10,2
3691: /* Copy 1 byte. */
370 bflr 31
371
372 stb 4,0(10)
373 blr
374
375 /* Handles copies of 0~8 bytes. In: cr6 = length compared with 8 (set at entry), r10 = DST. */
376 .align 4
377L(small):
378 mtocrf 0x01,5 /* CR bits 29/30/31 <- the 4/2/1 bits of the length, for L(copy_tail). */
379 bne cr6,L(copy_tail) /* Length 0-7: use the branch tree (length 0 stores nothing). */
380
381 stw 4,0(10) /* Length is exactly 8: two word stores. */
382 stw 4,4(10)
383 blr
384
18e0054b 385END_GEN_TB (MEMSET,TB_TOCLESS)
33b8d90a
LM
386libc_hidden_builtin_def (memset)
387
388/* Copied from bzero.S to prevent the linker from inserting a stub
389 between bzero and memset. Rearranges the (s [r3], n [r4]) arguments into memset's registers and branches into memset's body. */
d5b41185 390ENTRY_TOCLESS (__bzero)
33b8d90a
LM
391 CALL_MCOUNT 3
392 mr r5,r4 /* The length argument (r4) becomes memset's n (r5). */
393 li r4,0 /* Fill value is zero. */
394 b L(_memset) /* Join memset after its ENTRY/CALL_MCOUNT; r3 already holds the destination. */
3b473fec
AZ
395END (__bzero)
396#ifndef __bzero
2d67d91a 397weak_alias (__bzero, bzero)
8a29a3d0 398#endif