]> git.ipfire.org Git - thirdparty/glibc.git/blame - sysdeps/powerpc/powerpc64/power7/memset.S
Update copyright dates with scripts/update-copyrights
[thirdparty/glibc.git] / sysdeps / powerpc / powerpc64 / power7 / memset.S
CommitLineData
33b8d90a 1/* Optimized memset implementation for PowerPC64/POWER7.
2b778ceb 2 Copyright (C) 2010-2021 Free Software Foundation, Inc.
33b8d90a
LM
3 Contributed by Luis Machado <luisgpm@br.ibm.com>.
4 This file is part of the GNU C Library.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
59ba27a6 17 License along with the GNU C Library; if not, see
5a82c748 18 <https://www.gnu.org/licenses/>. */
33b8d90a
LM
19
20#include <sysdep.h>
33b8d90a 21
f17a4233 22/* void * [r3] memset (void *s [r3], int c [r4], size_t n [r5]);
33b8d90a
LM
23 Returns 's'. */
24
18e0054b
WSM
25#ifndef MEMSET
26# define MEMSET memset
27#endif
33b8d90a 28 .machine power7
d5b41185 29ENTRY_TOCLESS (MEMSET, 5)
33b8d90a
LM
30 CALL_MCOUNT 3
31
32L(_memset):
33 cmpldi cr7,5,31
34 cmpldi cr6,5,8
35 mr 10,3
36
37 /* Replicate byte to word. */
3be87c77
AM
38 insrdi 4,4,8,48
39 insrdi 4,4,16,32
33b8d90a
LM
40 ble cr6,L(small) /* If length <= 8, use short copy code. */
41
42 neg 0,3
43 ble cr7,L(medium) /* If length < 32, use medium copy code. */
44
45 andi. 11,10,7 /* Check alignment of DST. */
46 insrdi 4,4,32,0 /* Replicate word to double word. */
47
48 mr 12,5
49 beq L(big_aligned)
50
51 clrldi 0,0,61
52 mtocrf 0x01,0
53 subf 5,0,5
54
55 /* Get DST aligned to 8 bytes. */
561: bf 31,2f
57
58 stb 4,0(10)
59 addi 10,10,1
602: bf 30,4f
61
62 sth 4,0(10)
63 addi 10,10,2
644: bf 29,L(big_aligned)
65
66 stw 4,0(10)
67 addi 10,10,4
68
69 .align 4
70L(big_aligned):
71
72 cmpldi cr5,5,255
73 li 0,32
74 dcbtst 0,10
75 cmpldi cr6,4,0
76 srdi 9,5,3 /* Number of full doublewords remaining. */
77 crand 27,26,21
78 mtocrf 0x01,9
79 bt 27,L(huge)
80
81 /* From this point on, either the value isn't 0 or the remaining
82 length is at most 255 bytes, so dcbz is not used. */
83
84 srdi 8,5,5
85 clrldi 11,5,61
86 cmpldi cr6,11,0
87 cmpldi cr1,9,4
88 mtctr 8
89
90 /* Copy 1~3 doublewords so the main loop starts
91 at a multiple of 32 bytes. */
92
93 bf 30,1f
94
95 std 4,0(10)
96 std 4,8(10)
97 addi 10,10,16
98 bf 31,L(big_loop)
99
100 std 4,0(10)
101 addi 10,10,8
102 mr 12,10
103 blt cr1,L(tail_bytes)
104 b L(big_loop)
105
106 .align 4
1071: /* Copy 1 doubleword. */
108 bf 31,L(big_loop)
109
110 std 4,0(10)
111 addi 10,10,8
112
113 /* Main aligned copy loop. Copies 32-bytes at a time and
114 ping-pong through r10 and r12 to avoid AGEN delays. */
115 .align 4
116L(big_loop):
117 addi 12,10,32
118 std 4,0(10)
119 std 4,8(10)
120 std 4,16(10)
121 std 4,24(10)
122 bdz L(tail_bytes)
123
124 addi 10,10,64
125 std 4,0(12)
126 std 4,8(12)
127 std 4,16(12)
128 std 4,24(12)
129 bdnz L(big_loop)
130
131 mr 12,10
132 b L(tail_bytes)
133
134 .align 4
135L(tail_bytes):
136
137 /* Check for tail bytes. */
138 beqlr cr6
139
140 clrldi 0,5,61
141 mtocrf 0x01,0
142
143 /* At this point we have a tail of 0-7 bytes and we know that the
144 destination is doubleword-aligned. */
1454: /* Copy 4 bytes. */
146 bf 29,2f
147
148 stw 4,0(12)
149 addi 12,12,4
1502: /* Copy 2 bytes. */
151 bf 30,1f
152
153 sth 4,0(12)
154 addi 12,12,2
1551: /* Copy 1 byte. */
156 bflr 31
157
158 stb 4,0(12)
159 blr
160
161 /* Special case when value is 0 and we have a long length to deal
162 with. Use dcbz to zero out 128-bytes at a time. Before using
163 dcbz though, we need to get the destination 128-bytes aligned. */
164 .align 4
165L(huge):
166 andi. 11,10,127
167 neg 0,10
168 beq L(huge_aligned)
169
170 clrldi 0,0,57
171 subf 5,0,5
172 srdi 0,0,3
173 mtocrf 0x01,0
174
175 /* Get DST aligned to 128 bytes. */
1768: bf 28,4f
177
178 std 4,0(10)
179 std 4,8(10)
180 std 4,16(10)
181 std 4,24(10)
182 std 4,32(10)
183 std 4,40(10)
184 std 4,48(10)
185 std 4,56(10)
186 addi 10,10,64
187 .align 4
1884: bf 29,2f
189
190 std 4,0(10)
191 std 4,8(10)
192 std 4,16(10)
193 std 4,24(10)
194 addi 10,10,32
195 .align 4
1962: bf 30,1f
197
198 std 4,0(10)
199 std 4,8(10)
200 addi 10,10,16
201 .align 4
2021: bf 31,L(huge_aligned)
203
204 std 4,0(10)
205 addi 10,10,8
206
207
208L(huge_aligned):
209 srdi 8,5,7
210 clrldi 11,5,57
211 cmpldi cr6,11,0
212 mtctr 8
213
214 .align 4
215L(huge_loop):
216 dcbz 0,10
217 addi 10,10,128
218 bdnz L(huge_loop)
219
220 /* Check how many bytes are still left. */
221 beqlr cr6
222
223 subf 9,3,10
224 subf 5,9,12
225 srdi 8,5,3
226 cmpldi cr6,8,0
227 mtocrf 0x01,8
228
229 /* We have a tail of 1~127 bytes. Copy up to 15 doublewords for
230 speed. We'll handle the resulting tail bytes later. */
231 beq cr6,L(tail)
232
2338: bf 28,4f
234
235 std 4,0(10)
236 std 4,8(10)
237 std 4,16(10)
238 std 4,24(10)
239 std 4,32(10)
240 std 4,40(10)
241 std 4,48(10)
242 std 4,56(10)
243 addi 10,10,64
244 .align 4
2454: bf 29,2f
246
247 std 4,0(10)
248 std 4,8(10)
249 std 4,16(10)
250 std 4,24(10)
251 addi 10,10,32
252 .align 4
2532: bf 30,1f
254
255 std 4,0(10)
256 std 4,8(10)
257 addi 10,10,16
258 .align 4
2591: bf 31,L(tail)
260
261 std 4,0(10)
262 addi 10,10,8
263
264 /* Handle the rest of the tail bytes here. */
265L(tail):
266 mtocrf 0x01,5
267
268 .align 4
2694: bf 29,2f
270
271 stw 4,0(10)
272 addi 10,10,4
273 .align 4
2742: bf 30,1f
275
276 sth 4,0(10)
277 addi 10,10,2
278 .align 4
2791: bflr 31
280
281 stb 4,0(10)
282 blr
283
284 /* Expanded tree to copy tail bytes without increments. */
285 .align 4
286L(copy_tail):
287 bf 29,L(FXX)
288
289 stw 4,0(10)
290 bf 30,L(TFX)
291
292 sth 4,4(10)
293 bflr 31
294
295 stb 4,6(10)
296 blr
297
298 .align 4
299L(FXX): bf 30,L(FFX)
300
301 sth 4,0(10)
302 bflr 31
303
304 stb 4,2(10)
305 blr
306
307 .align 4
308L(TFX): bflr 31
309
310 stb 4,4(10)
311 blr
312
313 .align 4
314L(FFX): bflr 31
315
316 stb 4,0(10)
317 blr
318
319 /* Handle copies of 9~31 bytes. */
320 .align 4
321L(medium):
322 /* At least 9 bytes to go. */
323 andi. 11,10,3
324 clrldi 0,0,62
325 beq L(medium_aligned)
326
3be87c77 327 /* Force 4-bytes alignment for DST. */
33b8d90a
LM
328 mtocrf 0x01,0
329 subf 5,0,5
3301: /* Copy 1 byte. */
331 bf 31,2f
332
333 stb 4,0(10)
334 addi 10,10,1
3352: /* Copy 2 bytes. */
336 bf 30,L(medium_aligned)
337
338 sth 4,0(10)
339 addi 10,10,2
340
341 .align 4
342L(medium_aligned):
343 /* At least 6 bytes to go, and DST is word-aligned. */
344 cmpldi cr1,5,16
345 mtocrf 0x01,5
346 blt cr1,8f
347
348 /* Copy 16 bytes. */
349 stw 4,0(10)
350 stw 4,4(10)
351 stw 4,8(10)
352 stw 4,12(10)
353 addi 10,10,16
3548: /* Copy 8 bytes. */
355 bf 28,4f
356
357 stw 4,0(10)
358 stw 4,4(10)
359 addi 10,10,8
3604: /* Copy 4 bytes. */
361 bf 29,2f
362
363 stw 4,0(10)
364 addi 10,10,4
3652: /* Copy 2 bytes. */
366 bf 30,1f
367
368 sth 4,0(10)
369 addi 10,10,2
3701: /* Copy 1 byte. */
371 bflr 31
372
373 stb 4,0(10)
374 blr
375
376 /* Handles copies of 0~8 bytes. */
377 .align 4
378L(small):
379 mtocrf 0x01,5
380 bne cr6,L(copy_tail)
381
382 stw 4,0(10)
383 stw 4,4(10)
384 blr
385
18e0054b 386END_GEN_TB (MEMSET,TB_TOCLESS)
33b8d90a
LM
387libc_hidden_builtin_def (memset)
388
389/* Copied from bzero.S to prevent the linker from inserting a stub
390 between bzero and memset. */
d5b41185 391ENTRY_TOCLESS (__bzero)
33b8d90a
LM
392 CALL_MCOUNT 3
393 mr r5,r4
394 li r4,0
395 b L(_memset)
3b473fec
AZ
396END (__bzero)
397#ifndef __bzero
2d67d91a 398weak_alias (__bzero, bzero)
8a29a3d0 399#endif