]> git.ipfire.org Git - thirdparty/glibc.git/blob - sysdeps/powerpc/powerpc64/power7/memset.S
Update copyright dates with scripts/update-copyrights.
[thirdparty/glibc.git] / sysdeps / powerpc / powerpc64 / power7 / memset.S
1 /* Optimized memset implementation for PowerPC64/POWER7.
2 Copyright (C) 2010-2019 Free Software Foundation, Inc.
3 Contributed by Luis Machado <luisgpm@br.ibm.com>.
4 This file is part of the GNU C Library.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
19
20 #include <sysdep.h>
21
22 /* void * [r3] memset (void *s [r3], int c [r4], size_t n [r5]);
23 Returns 's'. */
24
25 #ifndef MEMSET
26 # define MEMSET memset
27 #endif
28 .machine power7
29 ENTRY_TOCLESS (MEMSET, 5)
30 CALL_MCOUNT 3
31 /* In: r3 = s, r4 = c, r5 = n. r3 is never written, so it is returned unchanged. */
32 L(_memset):
33 cmpldi cr7,5,31 /* cr7: n compared with 31 (medium versus large). */
34 cmpldi cr6,5,8 /* cr6: n compared with 8 (eq means exactly 8 bytes). */
35 mr 10,3 /* r10 = running destination pointer. */
36
37 /* Replicate byte to word. */
38 insrdi 4,4,8,48
39 insrdi 4,4,16,32
40 ble cr6,L(small) /* If length <= 8, use short copy code. */
41
42 neg 0,3 /* r0 = -s; its low bits are the bytes needed to align DST. */
43 ble cr7,L(medium) /* If length < 32, use medium copy code. */
44
45 andi. 11,10,7 /* Check alignment of DST (cr0 eq if 8-byte aligned). */
46 insrdi 4,4,32,0 /* Replicate word to double word. */
47
48 mr 12,5 /* r12 = original length; read again after L(huge_loop). */
49 beq L(big_aligned)
50
51 clrldi 0,0,61 /* r0 = (-s) & 7 = bytes up to the next 8-byte boundary. */
52 mtocrf 0x01,0 /* Low 4 bits of r0 -> CR bits 28-31 (8/4/2/1). */
53 subf 5,0,5 /* Remove the alignment bytes from the remaining length. */
54
55 /* Get DST aligned to 8 bytes. */
56 1: bf 31,2f /* Bit 31 clear: no single byte needed. */
57
58 stb 4,0(10)
59 addi 10,10,1
60 2: bf 30,4f /* Bit 30 clear: no halfword needed. */
61
62 sth 4,0(10)
63 addi 10,10,2
64 4: bf 29,L(big_aligned) /* Bit 29 clear: no word needed. */
65
66 stw 4,0(10)
67 addi 10,10,4
68
69 .align 4
70 L(big_aligned):
71 /* Here r10 is 8-byte aligned and r5 is the number of bytes still to store. */
72 cmpldi cr5,5,255 /* cr5 gt: more than 255 bytes remain. */
73 li 0,32 /* NOTE(review): r0 looks unused before it is next overwritten - confirm. */
74 dcbtst 0,10 /* Cache touch-for-store hint on the block at r10. */
75 cmpldi cr6,4,0 /* cr6 eq: the replicated fill value is zero. */
76 srdi 9,5,3 /* Number of full doublewords remaining. */
77 crand 27,26,21 /* CR bit 27 = (value == 0) && (length > 255). */
78 mtocrf 0x01,9 /* Low 4 bits of the doubleword count -> CR bits 28-31. */
79 bt 27,L(huge)
80
81 /* From this point on the dcbz path is not taken: either the value is
82 non-zero or at most 255 bytes remain. At least 25 bytes are left. */
83
84 srdi 8,5,5 /* r8 = number of 32-byte chunks. */
85 clrldi 11,5,61 /* r11 = length mod 8 (trailing bytes). */
86 cmpldi cr6,11,0 /* cr6 eq: no trailing bytes; tested in L(tail_bytes). */
87 cmpldi cr1,9,4 /* cr1 lt: fewer than 4 doublewords in total. */
88 mtctr 8 /* CTR = 32-byte chunk count for L(big_loop). */
89
90 /* Copy 1~3 doublewords so that what is left for the main loop
91 is a whole number of 32-byte chunks. */
92
93 bf 30,1f /* Bit 30 clear: no pair of doublewords to store. */
94
95 std 4,0(10)
96 std 4,8(10)
97 addi 10,10,16
98 bf 31,L(big_loop)
99
100 std 4,0(10)
101 addi 10,10,8
102 mr 12,10 /* L(tail_bytes) stores through r12. */
103 blt cr1,L(tail_bytes) /* Exactly 3 doublewords in total: CTR is 0, skip the loop. */
104 b L(big_loop)
105
106 .align 4
107 1: /* Copy 1 doubleword. */
108 bf 31,L(big_loop)
109
110 std 4,0(10)
111 addi 10,10,8
112
113 /* Main aligned copy loop. Stores 32 bytes per CTR count and
114 ping-pongs between r10 and r12 as base to avoid AGEN delays. */
115 .align 4
116 L(big_loop):
117 addi 12,10,32 /* r12 = base of the next chunk (end of this one). */
118 std 4,0(10)
119 std 4,8(10)
120 std 4,16(10)
121 std 4,24(10)
122 bdz L(tail_bytes) /* Last chunk stored; r12 already points just past it. */
123
124 addi 10,10,64 /* r10 = end of the chunk stored next through r12. */
125 std 4,0(12)
126 std 4,8(12)
127 std 4,16(12)
128 std 4,24(12)
129 bdnz L(big_loop)
130
131 mr 12,10 /* L(tail_bytes) expects the end pointer in r12. */
132 b L(tail_bytes)
133
134 .align 4
135 L(tail_bytes):
136 /* In: r12 = next byte to store, r5 = byte count, cr6 eq if r5 mod 8 == 0. */
137 /* Check for tail bytes. */
138 beqlr cr6 /* No trailing bytes: return (r3 still holds s). */
139
140 clrldi 0,5,61 /* r0 = length mod 8. */
141 mtocrf 0x01,0 /* Its low bits -> CR bits 29-31 (4/2/1). */
142
143 /* At this point we have a tail of 1-7 bytes and we know that the
144 destination is doubleword-aligned. */
145 4: /* Copy 4 bytes. */
146 bf 29,2f
147
148 stw 4,0(12)
149 addi 12,12,4
150 2: /* Copy 2 bytes. */
151 bf 30,1f
152
153 sth 4,0(12)
154 addi 12,12,2
155 1: /* Copy 1 byte. */
156 bflr 31 /* Return if no odd byte is left. */
157
158 stb 4,0(12)
159 blr
160
161 /* Special case when value is 0 and we have a long length to deal
162 with. Use dcbz to zero out 128-bytes at a time. Before using
163 dcbz though, we need to get the destination 128-bytes aligned. */
164 .align 4
165 L(huge):
166 andi. 11,10,127 /* cr0 eq: r10 is already 128-byte aligned. */
167 neg 0,10
168 beq L(huge_aligned)
169
170 clrldi 0,0,57 /* r0 = (-r10) & 127 = bytes up to the next 128-byte boundary. */
171 subf 5,0,5 /* Remove those bytes from the remaining length. */
172 srdi 0,0,3 /* Convert to doublewords (r10 is 8-byte aligned here). */
173 mtocrf 0x01,0 /* Doubleword count bits 8/4/2/1 -> CR bits 28-31. */
174
175 /* Get DST aligned to 128 bytes. */
176 8: bf 28,4f /* Bit 28 set: store 8 doublewords (64 bytes). */
177
178 std 4,0(10)
179 std 4,8(10)
180 std 4,16(10)
181 std 4,24(10)
182 std 4,32(10)
183 std 4,40(10)
184 std 4,48(10)
185 std 4,56(10)
186 addi 10,10,64
187 .align 4
188 4: bf 29,2f /* Bit 29 set: store 4 doublewords (32 bytes). */
189
190 std 4,0(10)
191 std 4,8(10)
192 std 4,16(10)
193 std 4,24(10)
194 addi 10,10,32
195 .align 4
196 2: bf 30,1f /* Bit 30 set: store 2 doublewords (16 bytes). */
197
198 std 4,0(10)
199 std 4,8(10)
200 addi 10,10,16
201 .align 4
202 1: bf 31,L(huge_aligned) /* Bit 31 set: store 1 doubleword. */
203
204 std 4,0(10)
205 addi 10,10,8
206
207
208 L(huge_aligned):
209 srdi 8,5,7 /* r8 = number of 128-byte blocks. */
210 clrldi 11,5,57 /* r11 = length mod 128. */
211 cmpldi cr6,11,0 /* cr6 eq: nothing left after the dcbz loop. */
212 mtctr 8
213
214 .align 4
215 L(huge_loop):
216 dcbz 0,10 /* Zero the cache block at r10. */
217 addi 10,10,128
218 bdnz L(huge_loop)
219
220 /* Check how many bytes are still left. */
221 beqlr cr6
222
223 subf 9,3,10 /* r9 = bytes written so far (cursor minus s). */
224 subf 5,9,12 /* r5 = original length (r12) minus bytes written. */
225 srdi 8,5,3 /* r8 = full doublewords left. */
226 cmpldi cr6,8,0
227 mtocrf 0x01,8 /* Doubleword count bits 8/4/2/1 -> CR bits 28-31. */
228
229 /* We have a tail of 1~127 bytes. Copy up to 15 doublewords for
230 speed. We'll handle the resulting tail bytes later. */
231 beq cr6,L(tail) /* Fewer than 8 bytes left: no doublewords to store. */
232
233 8: bf 28,4f /* Bit 28 set: store 8 doublewords (64 bytes). */
234
235 std 4,0(10)
236 std 4,8(10)
237 std 4,16(10)
238 std 4,24(10)
239 std 4,32(10)
240 std 4,40(10)
241 std 4,48(10)
242 std 4,56(10)
243 addi 10,10,64
244 .align 4
245 4: bf 29,2f /* Bit 29 set: store 4 doublewords (32 bytes). */
246
247 std 4,0(10)
248 std 4,8(10)
249 std 4,16(10)
250 std 4,24(10)
251 addi 10,10,32
252 .align 4
253 2: bf 30,1f /* Bit 30 set: store 2 doublewords (16 bytes). */
254
255 std 4,0(10)
256 std 4,8(10)
257 addi 10,10,16
258 .align 4
259 1: bf 31,L(tail) /* Bit 31 set: store 1 doubleword. */
260
261 std 4,0(10)
262 addi 10,10,8
263
264 /* Handle the rest of the tail bytes here (r10 = next byte, r5 = bytes left). */
265 L(tail):
266 mtocrf 0x01,5 /* Low 4 bits of r5 -> CR bits 28-31; only 29-31 are tested. */
267
268 .align 4
269 4: bf 29,2f /* Bit 29 set: store a word. */
270
271 stw 4,0(10)
272 addi 10,10,4
273 .align 4
274 2: bf 30,1f /* Bit 30 set: store a halfword. */
275
276 sth 4,0(10)
277 addi 10,10,2
278 .align 4
279 1: bflr 31 /* Return unless one odd byte is left. */
280
281 stb 4,0(10)
282 blr
283
284 /* Expanded tree to copy tail bytes without increments. Entered from
285 L(small) with CR bits 29/30/31 = the 4/2/1 bits of the length. */ .align 4
286 L(copy_tail):
287 bf 29,L(FXX)
288
289 stw 4,0(10)
290 bf 30,L(TFX)
291
292 sth 4,4(10) /* Length 6 or 7: halfword follows the word. */
293 bflr 31
294
295 stb 4,6(10) /* Length 7. */
296 blr
297
298 .align 4
299 L(FXX): bf 30,L(FFX) /* 4-bit clear: length is 0-3. */
300
301 sth 4,0(10)
302 bflr 31
303
304 stb 4,2(10) /* Length 3. */
305 blr
306
307 .align 4
308 L(TFX): bflr 31 /* 4-bit set, 2-bit clear: length is 4 or 5. */
309
310 stb 4,4(10) /* Length 5. */
311 blr
312
313 .align 4
314 L(FFX): bflr 31 /* 4-bit and 2-bit clear: length is 0 or 1. */
315
316 stb 4,0(10) /* Length 1. */
317 blr
318
319 /* Handle copies of 9~31 bytes. */
320 .align 4
321 L(medium):
322 /* At least 9 bytes to go. r0 = -s was computed before branching here. */
323 andi. 11,10,3 /* cr0 eq: DST is already word-aligned. */
324 clrldi 0,0,62 /* r0 = (-s) & 3 = bytes up to the next word boundary. */
325 beq L(medium_aligned)
326
327 /* Force 4-bytes alignment for DST. */
328 mtocrf 0x01,0
329 subf 5,0,5 /* Remove the alignment bytes from the remaining length. */
330 1: /* Copy 1 byte. */
331 bf 31,2f
332
333 stb 4,0(10)
334 addi 10,10,1
335 2: /* Copy 2 bytes. */
336 bf 30,L(medium_aligned)
337
338 sth 4,0(10)
339 addi 10,10,2
340
341 .align 4
342 L(medium_aligned):
343 /* At least 6 bytes to go, and DST is word-aligned. */
344 cmpldi cr1,5,16
345 mtocrf 0x01,5 /* Low 4 bits of the length -> CR bits 28-31 (8/4/2/1). */
346 blt cr1,8f /* Fewer than 16 bytes: skip the 16-byte store group. */
347
348 /* Copy 16 bytes. */
349 stw 4,0(10)
350 stw 4,4(10)
351 stw 4,8(10)
352 stw 4,12(10)
353 addi 10,10,16
354 8: /* Copy 8 bytes. */
355 bf 28,4f
356
357 stw 4,0(10)
358 stw 4,4(10)
359 addi 10,10,8
360 4: /* Copy 4 bytes. */
361 bf 29,2f
362
363 stw 4,0(10)
364 addi 10,10,4
365 2: /* Copy 2 bytes. */
366 bf 30,1f
367
368 sth 4,0(10)
369 addi 10,10,2
370 1: /* Copy 1 byte. */
371 bflr 31
372
373 stb 4,0(10)
374 blr
375
376 /* Handles copies of 0~8 bytes. r4 holds the byte replicated to a word only. */
377 .align 4
378 L(small):
379 mtocrf 0x01,5 /* Low 4 bits of the length -> CR bits 28-31 for L(copy_tail). */
380 bne cr6,L(copy_tail) /* cr6 still holds the entry compare of n with 8. */
381 /* Exactly 8 bytes: two word stores. */
382 stw 4,0(10)
383 stw 4,4(10)
384 blr
385
386 END_GEN_TB (MEMSET,TB_TOCLESS)
387 libc_hidden_builtin_def (memset)
388
389 /* Copied from bzero.S to prevent the linker from inserting a stub
390 between bzero and memset. In: r3 = s, r4 = n. */
391 ENTRY_TOCLESS (__bzero)
392 CALL_MCOUNT 3
393 mr r5,r4 /* Move the length into r5, where the memset body expects n. */
394 li r4,0 /* Fill value is zero. */
395 b L(_memset) /* Enter the memset body after its own entry macros. */
396 END (__bzero)
397 #ifndef __bzero
398 weak_alias (__bzero, bzero)
399 #endif