]> git.ipfire.org Git - thirdparty/glibc.git/blob - sysdeps/powerpc/powerpc32/power7/memset.S
Update copyright notices with scripts/update-copyrights
[thirdparty/glibc.git] / sysdeps / powerpc / powerpc32 / power7 / memset.S
1 /* Optimized memset implementation for PowerPC32/POWER7.
2 Copyright (C) 2010-2014 Free Software Foundation, Inc.
3 Contributed by Luis Machado <luisgpm@br.ibm.com>.
4 This file is part of the GNU C Library.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
19
20 #include <sysdep.h>
21
22 /* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]);
23 Returns 's'.

   Register roles established by the code below:
     r3  - 's'.  No instruction visible in this file writes r3, so it is
           still the return value at every blr/bflr/beqlr.
     r4  - fill value, replicated into all four bytes of the low word.
     r5  - byte count still to store (reduced after each alignment
           prologue).
     r7  - copy of r1 taken on entry; used to undo the stwu on the
           paths that allocate a stack frame.
     r10 - running store pointer, initialised from r3.
     r12 - copy of n taken on the >31-byte path; later reused as a
           second store pointer.
   cr7 is repeatedly loaded with "mtocrf 0x01,rX", which places the low
   four bits of rX in CR bits 28..31.  A following "bf 28/29/30/31"
   therefore skips a store when the 8/4/2/1 bit of rX is clear. */
24
25 .machine power7
26 EALIGN (memset, 5, 0)
27 CALL_MCOUNT
28
29 .align 4
30 L(_memset):
31 cmplwi cr7,5,31 /* cr7: n vs 31, consumed by the ble to L(medium). */
32 cmplwi cr6,5,8 /* cr6: n vs 8, consumed by the ble to L(small). */
33 mr 10,3 /* r10 = working destination pointer; r3 is left untouched. */
34 mr 7,1 /* Save original r1 for later. */
35 cfi_offset(31,-8) /* NOTE(review): no save of r31 is visible in this file, and the stwu below has no matching CFA adjustment - confirm the unwind info is intended. */
36
37 /* Replicate byte to word. The first insert copies the low byte of r4
   into the next byte up; the second copies the resulting low halfword
   into the upper half of the low word. Only the low word of r4 is
   used by the stores and the compare that follow. */
38 insrdi 4,4,8,48
39 insrdi 4,4,16,32
40
41 ble cr6,L(small) /* If length <= 8, use the small-size code. */
42
43 neg 0,3 /* r0 = -dst; its low bits give the distance to the next aligned address. */
44 ble cr7,L(medium) /* If length < 32, use the medium-size code. */
45
46 /* Length is at least 32 from here on. Allocate a 32-byte frame and
   save our word twice to create a doubleword that we will later
47 copy to a FPR. */
48 stwu 1,-32(1)
49 andi. 11,10,7 /* Check alignment of DST (sets cr0 for the beq below). */
50 mr 12,5 /* r12 = original n; L(huge) uses it to recompute the tail. */
51 stw 4,24(1)
52 stw 4,28(1)
53 beq L(big_aligned)
54
55 clrlwi 0,0,29 /* r0 = (-dst) & 7 = bytes needed to reach 8-byte alignment. */
56 mtocrf 0x01,0 /* cr7 bits 29/30/31 = the 4/2/1 bits of r0. */
57 subf 5,0,5 /* n -= r0. */
58
59 /* Get DST aligned to 8 bytes: store 1, then 2, then 4 bytes as
   selected by the bits of r0. */
60 1: bf 31,2f
61
62 stb 4,0(10)
63 addi 10,10,1
64 2: bf 30,4f
65
66 sth 4,0(10)
67 addi 10,10,2
68 4: bf 29,L(big_aligned)
69
70 stw 4,0(10)
71 addi 10,10,4
72
73 .align 4
74 L(big_aligned):
   /* Entered with r10 8-byte aligned and r5 = bytes left. The entry
      path requires n > 31 and the alignment prologue consumes at most
      7 bytes, so r5 >= 25 here. */
75 cmplwi cr5,5,255 /* cr5.gt <=> more than 255 bytes left. */
76 li 0,32 /* NOTE(review): r0 is overwritten on every visible path before it is read as a register. */
77 cmplwi cr1,5,160 /* NOTE(review): cr1 is recomputed below before any visible branch tests it. */
78 dcbtst 0,10 /* Cache touch-for-store hint at the destination. */
79 cmplwi cr6,4,0 /* cr6.eq <=> the replicated fill word is zero. */
80 srwi 9,5,3 /* Number of full doublewords remaining. */
81 crand 27,26,21 /* CR bit 27 = cr6.eq AND cr5.gt, i.e. fill == 0 and n > 255. */
82 mtocrf 0x01,9 /* cr7 bits 30/31 = the 2/1 bits of the doubleword count. */
83 bt 27,L(huge)
84
85 /* Not the huge case: either the value is non-zero or at most 255
   bytes are left, so dcbz is not used. If fewer than 32 bytes are
86 left (25..31), CTR is 0 and the path leaves via the blt cr1 below. */
87
88 srwi 8,5,5 /* r8 = number of whole 32-byte chunks. */
89 clrlwi 11,5,29 /* r11 = n & 7 = bytes left after the last doubleword. */
90 cmplwi cr6,11,0 /* cr6.eq <=> no tail bytes; tested by beqlr in L(tail_bytes). */
91 cmplwi cr1,9,4 /* cr1.lt <=> fewer than 4 doublewords, i.e. no 32-byte chunk. */
92 mtctr 8
93
94 /* Store the 1~3 doublewords that do not fill a 32-byte chunk first,
95 so the loops below only handle whole 32-byte chunks. */
96
97 bf 30,1f
98
   /* Two doublewords. */
99 stw 4,0(10)
100 stw 4,4(10)
101 stw 4,8(10)
102 stw 4,12(10)
103 addi 10,10,16
104 bf 31,L(big_loop)
105
   /* Third doubleword. */
106 stw 4,0(10)
107 stw 4,4(10)
108 addi 10,10,8
109 mr 12,10 /* L(tail_bytes) stores through r12. */
110 blt cr1,L(tail_bytes) /* Exactly 3 doublewords: nothing left for the chunk loops. */
111
112 b L(big_loop)
113
114 .align 4
115 1: /* Copy 1 doubleword. */
116 bf 31,L(big_loop)
117
118 stw 4,0(10)
119 stw 4,4(10)
120 addi 10,10,8
121
122 /* First use a 32-bytes loop with stw's to try and avoid the LHS due
123 to the lfd we will do next. Also, ping-pong through r10 and r12
124 to avoid AGEN delays.
   (LHS presumably stands for load-hit-store; the load that follows in
   this file is the lxvdsx in L(big_loop_fast_setup).)
   No branch in this file jumps back to L(big_loop), so despite the name
   it runs once and stores at most two 32-byte chunks. CTR holds the
   number of 32-byte chunks left. */
125 .align 4
126 L(big_loop):
127 addi 12,10,32 /* r12 = address of the second chunk / tail pointer if we stop here. */
128 stw 4,0(10)
129 stw 4,4(10)
130 stw 4,8(10)
131 stw 4,12(10)
132 stw 4,16(10)
133 stw 4,20(10)
134 stw 4,24(10)
135 stw 4,28(10)
136 bdz L(tail_bytes) /* That was the last chunk; r12 already points past it. */
137
138 addi 10,10,64 /* r10 = address after the second chunk. */
139 stw 4,0(12)
140 stw 4,4(12)
141 stw 4,8(12)
142 stw 4,12(12)
143 stw 4,16(12)
144 stw 4,20(12)
145 stw 4,24(12)
146 stw 4,28(12)
147 bdnz L(big_loop_fast_setup) /* More chunks remain: continue with the VSX loop. */
148
149 mr 12,10 /* L(tail_bytes) stores through r12. */
150 b L(tail_bytes)
151
152 /* Now that we're probably past the LHS window, use the VSX to
153 speed up the loop.
   Entered with r10 = next store address and CTR = remaining 32-byte
   chunks (non-zero, since the bdnz that leads here was taken). */
154 L(big_loop_fast_setup):
155 li 11,24 /* Offset of the doubleword saved at 24(r1). r11's old value (n & 7) is already captured in cr6. */
156 li 6,16 /* Index used to reach the second 16 bytes of each chunk. */
157 lxvdsx 4,1,11 /* Load the saved doubleword from r1+24 and splat it across VSR 4. */
158
159 .align 4
160 L(big_loop_fast):
161 addi 12,10,32 /* r12 = second chunk / tail pointer if we stop after the first. */
162 stxvd2x 4,0,10 /* 16 bytes at r10. */
163 stxvd2x 4,10,6 /* 16 bytes at r10+16. */
164 bdz L(tail_bytes)
165
166 addi 10,10,64
167 stxvd2x 4,0,12 /* 16 bytes at r12. */
168 stxvd2x 4,12,6 /* 16 bytes at r12+16. */
169 bdnz L(big_loop_fast)
170
171 mr 12,10 /* Falls through to L(tail_bytes), which stores through r12. */
172
173 .align 4
174 L(tail_bytes):
175
176 /* Check for tail bytes. Entered with r12 = next store address and
   cr6.eq set when (n & 7) == 0, as computed in L(big_aligned). */
177 mr 1,7 /* Restore r1, releasing the frame allocated by the stwu. */
178 beqlr cr6 /* No tail bytes: return (r3 is still 's'). */
179
180 clrlwi 0,5,29 /* r0 = n & 7. */
181 mtocrf 0x01,0 /* cr7 bits 29/30/31 = the 4/2/1 bits of the tail length. */
182
183 /* At this point we have a tail of 1-7 bytes and we know that the
184 destination is doubleword-aligned. */
185 4: /* Copy 4 bytes. */
186 bf 29,2f
187
188 stw 4,0(12)
189 addi 12,12,4
190 2: /* Copy 2 bytes. */
191 bf 30,1f
192
193 sth 4,0(12)
194 addi 12,12,2
195 1: /* Copy 1 byte. */
196 bflr 31 /* Return if the 1-bit is clear. */
197
198 stb 4,0(12)
199 blr
200
201
202 /* Special case when value is 0 and we have a long length to deal
203 with. Use dcbz to zero out 128-bytes at a time. Before using
204 dcbz though, we need to get the destination 128-bytes aligned.
   Reached only from L(big_aligned) when the fill word is zero and more
   than 255 bytes are left; r10 is 8-byte aligned and the stack frame
   from the stwu is still allocated. */
205 .align 4
206 L(huge):
207 lfd 4,24(1) /* f4 = the doubleword of fill bytes saved on the stack. */
208 andi. 11,10,127 /* cr0.eq <=> DST already 128-byte aligned. */
209 neg 0,10
210 beq L(huge_aligned)
211
212 clrlwi 0,0,25 /* r0 = (-dst) & 127 = bytes needed to reach 128-byte alignment. */
213 subf 5,0,5 /* n -= r0. */
214 srwi 0,0,3 /* Convert to doublewords (0..15). */
215 mtocrf 0x01,0 /* cr7 bits 28..31 = the 8/4/2/1 bits of the doubleword count. */
216
217 /* Get DST aligned to 128 bytes, storing 8, 4, 2 and 1 doublewords
   as selected by cr7. */
218 8: bf 28,4f
219
220 stfd 4,0(10)
221 stfd 4,8(10)
222 stfd 4,16(10)
223 stfd 4,24(10)
224 stfd 4,32(10)
225 stfd 4,40(10)
226 stfd 4,48(10)
227 stfd 4,56(10)
228 addi 10,10,64
229 .align 4
230 4: bf 29,2f
231
232 stfd 4,0(10)
233 stfd 4,8(10)
234 stfd 4,16(10)
235 stfd 4,24(10)
236 addi 10,10,32
237 .align 4
238 2: bf 30,1f
239
240 stfd 4,0(10)
241 stfd 4,8(10)
242 addi 10,10,16
243 .align 4
244 1: bf 31,L(huge_aligned)
245
246 stfd 4,0(10)
247 addi 10,10,8
248
249 L(huge_aligned):
   /* r10 is 128-byte aligned. More than 255 bytes were left on entry
      to L(huge) and at most 127 were used for alignment, so r5 >= 128
      and CTR below is at least 1. */
250 srwi 8,5,7 /* r8 = number of whole 128-byte blocks. */
251 clrlwi 11,5,25 /* r11 = n & 127. */
252 cmplwi cr6,11,0 /* cr6.eq <=> nothing left after the dcbz loop. */
253 mtctr 8
254
255 /* Copies 128-bytes at a time. */
256 .align 4
257 L(huge_loop):
258 dcbz 0,10
259 addi 10,10,128
260 bdnz L(huge_loop)
261
262 /* We have a tail of 0~127 bytes to handle. */
263 mr 1,7 /* Restore r1, releasing the frame allocated by the stwu. */
264 beqlr cr6 /* Tail is empty: return (r3 is still 's'). */
265
266 subf 9,3,10 /* r9 = bytes written so far (r10 - s). */
267 subf 5,9,12 /* r5 = original n (kept in r12) - bytes written = bytes left. */
268 srwi 8,5,3 /* r8 = doublewords left (0..15). */
269 cmplwi cr6,8,0
270 mtocrf 0x01,8 /* cr7 bits 28..31 = the 8/4/2/1 bits of the doubleword count. */
271
272 /* We have a tail of 1~127 bytes. Copy up to 15 doublewords for
273 speed. We'll handle the resulting tail bytes later. */
274 beq cr6,L(tail) /* Fewer than 8 bytes left: skip the doubleword stores. */
275
276 8: bf 28,4f
277
278 stfd 4,0(10)
279 stfd 4,8(10)
280 stfd 4,16(10)
281 stfd 4,24(10)
282 stfd 4,32(10)
283 stfd 4,40(10)
284 stfd 4,48(10)
285 stfd 4,56(10)
286 addi 10,10,64
287 .align 4
288 4: bf 29,2f
289
290 stfd 4,0(10)
291 stfd 4,8(10)
292 stfd 4,16(10)
293 stfd 4,24(10)
294 addi 10,10,32
295 .align 4
296 2: bf 30,1f
297
298 stfd 4,0(10)
299 stfd 4,8(10)
300 addi 10,10,16
301 .align 4
302 1: bf 31,L(tail)
303
304 stfd 4,0(10)
305 addi 10,10,8
306
307 /* Handle the rest of the tail bytes here. Only the low bits of r5
   matter: cr7 bits 29/30/31 select a 4-, 2- and 1-byte store. r4 is
   zero on this path, so the integer stores write zeros as well. */
308 L(tail):
309 mtocrf 0x01,5
310
311 .align 4
312 4: bf 29,2f
313
314 stw 4,0(10)
315 addi 10,10,4
316 .align 4
317 2: bf 30,1f
318
319 sth 4,0(10)
320 addi 10,10,2
321 .align 4
322 1: bflr 31
323
324 stb 4,0(10)
325 blr
326
327
328 /* Expanded tree to copy tail bytes without increments.
   Reached from L(small) with cr7 bits 29/30/31 = the 4/2/1 bits of n
   (n is 0..7 here) and r10 = destination. Each leaf uses a fixed
   displacement instead of advancing r10. The label letters record the
   outcome of the tests already made: first letter for the 4-bit,
   second for the 2-bit (T = set, F = clear, X = not yet tested). */
329 .align 4
330 L(copy_tail):
331 bf 29,L(FXX)
332
333 stw 4,0(10) /* 4-bit set: bytes 0..3. */
334 bf 30,L(TFX)
335
336 sth 4,4(10) /* 4- and 2-bit set: bytes 4..5. */
337 bflr 31
338
339 stb 4,6(10) /* n == 7: byte 6. */
340 blr
341
342 .align 4
343 L(FXX): bf 30,L(FFX)
344
345 sth 4,0(10) /* 2-bit set, 4-bit clear: bytes 0..1. */
346 bflr 31
347
348 stb 4,2(10) /* n == 3: byte 2. */
349 blr
350
351 .align 4
352 L(TFX): bflr 31
353
354 stb 4,4(10) /* n == 5: byte 4. */
355 blr
356
357 .align 4
358 L(FFX): bflr 31 /* n == 0: nothing to store. */
359
360 stb 4,0(10) /* n == 1: byte 0. */
361 blr
362
363 /* Handle copies of 9~31 bytes.
   Entered before the stwu, so no stack frame exists on this path and
   r1 needs no restoring. r0 holds -dst from the entry code. */
364 .align 4
365 L(medium):
366 /* At least 9 bytes to go. */
367 andi. 11,10,3 /* cr0.eq <=> DST already word-aligned. */
368 clrlwi 0,0,30 /* r0 = (-dst) & 3 = bytes needed to reach 4-byte alignment. */
369 beq L(medium_aligned)
370
371 /* Force 4-bytes alignment for DST. */
372 mtocrf 0x01,0 /* cr7 bits 30/31 = the 2/1 bits of r0. */
373 subf 5,0,5 /* n -= r0. */
374 1: /* Copy 1 byte. */
375 bf 31,2f
376
377 stb 4,0(10)
378 addi 10,10,1
379 2: /* Copy 2 bytes. */
380 bf 30,L(medium_aligned)
381
382 sth 4,0(10)
383 addi 10,10,2
384
385 .align 4
386 L(medium_aligned):
387 /* At least 6 bytes to go, and DST is word-aligned. */
388 cmplwi cr1,5,16 /* cr1.lt <=> fewer than 16 bytes left. */
389 mtocrf 0x01,5 /* cr7 bits 28..31 = the 8/4/2/1 bits of n. */
390 blt cr1,8f
391
392 /* Copy 16 bytes. */
393 stw 4,0(10)
394 stw 4,4(10)
395 stw 4,8(10)
396 stw 4,12(10)
397 addi 10,10,16
398 8: /* Copy 8 bytes. */
399 bf 28,4f
400
401 stw 4,0(10)
402 stw 4,4(10)
403 addi 10,10,8
404 4: /* Copy 4 bytes. */
405 bf 29,2f
406
407 stw 4,0(10)
408 addi 10,10,4
409 2: /* Copy 2 bytes. */
410 bf 30,1f
411
412 sth 4,0(10)
413 addi 10,10,2
414 1: /* Copy 1 byte. */
415 bflr 31 /* Return if the 1-bit is clear (r3 is still 's'). */
416
417 stb 4,0(10)
418 blr
419
420 /* Handles copies of 0~8 bytes.
   cr6 still holds the entry comparison of n against 8. Entered before
   the stwu, so no stack frame exists on this path. */
421 .align 4
422 L(small):
423 mtocrf 0x01,5 /* cr7 bits 29/30/31 = the 4/2/1 bits of n, for L(copy_tail). */
424 bne cr6,L(copy_tail) /* n is 0..7: use the decision tree. */
425
   /* n == 8: two word stores, with no alignment fix-up beforehand. */
426 stw 4,0(10)
427 stw 4,4(10)
428 blr
429
430 END (memset)
431 libc_hidden_builtin_def (memset)