]> git.ipfire.org Git - thirdparty/glibc.git/blob - sysdeps/powerpc/powerpc64/power7/memset.S
14df042785d78a289a163c91dd244e1cc138d220
[thirdparty/glibc.git] / sysdeps / powerpc / powerpc64 / power7 / memset.S
1 /* Optimized memset implementation for PowerPC64/POWER7.
2 Copyright (C) 2010-2014 Free Software Foundation, Inc.
3 Contributed by Luis Machado <luisgpm@br.ibm.com>.
4 This file is part of the GNU C Library.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
19
20 #include <sysdep.h>
21
22 /* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]);
23 Returns 's'.

   Register use in the code below:
     r3  - 's'; never written, so it still holds the return value at
	   every blr.
     r4  - fill byte, replicated to a word and, on the >= 32 byte path,
	   to a doubleword.
     r5  - length; reduced by the bytes stored while aligning the
	   destination.
     r10 - current store address (starts as a copy of r3).
     r12 - original length on the >= 32 byte path; reused as a second
	   store address by the 32-byte main loop and its tail.
     cr7 - loaded with the low four bits of a count by 'mtocrf 0x01',
	   so 'bf 28/29/30/31' test the 8/4/2/1 bits of that count. */
24
25 .machine power7
26 EALIGN (memset, 5, 0)
27 CALL_MCOUNT 3
28
29 L(_memset):
30 cmpldi cr7,5,31 /* cr7: n against 31 (medium path test). */
31 cmpldi cr6,5,8 /* cr6: n against 8 (small path test). */
32 mr 10,3 /* r10 = running destination; r3 stays as the return value. */
33
34 /* Replicate byte to word. */
35 insrdi 4,4,8,48
36 insrdi 4,4,16,32
37 ble cr6,L(small) /* If length <= 8, use short copy code. */
38
39 neg 0,3 /* r0 = -s; its low bits are the distance to the next aligned address. */
40 ble cr7,L(medium) /* If length < 32, use medium copy code. */
41
42 andi. 11,10,7 /* Check alignment of DST (cr0.eq: already 8-byte aligned). */
43 insrdi 4,4,32,0 /* Replicate word to double word. */
44
45 mr 12,5 /* Keep the original length; L(huge) uses it to recompute the tail. */
46 beq L(big_aligned)
47
48 clrldi 0,0,61 /* r0 = (-s) & 7 = bytes needed to reach 8-byte alignment. */
49 mtocrf 0x01,0 /* cr7 = low four bits of r0. */
50 subf 5,0,5 /* Those bytes are stored just below; take them off the length. */
51
52 /* Get DST aligned to 8 bytes. */
53 1: bf 31,2f /* 1-bit clear: no single byte. */
54
55 stb 4,0(10)
56 addi 10,10,1
57 2: bf 30,4f /* 2-bit clear: no halfword. */
58
59 sth 4,0(10)
60 addi 10,10,2
61 4: bf 29,L(big_aligned) /* 4-bit clear: no word. */
62
63 stw 4,0(10)
64 addi 10,10,4
65
66 .align 4
67 L(big_aligned):
/* Here r10 is 8-byte aligned, r4 holds the fill byte in all eight bytes
   and r5 is the number of bytes left (at least 25: more than 31 on
   entry, at most 7 removed by the alignment stores). */
68
69 cmpldi cr5,5,255 /* cr5.gt: more than 255 bytes left. */
70 li 0,32 /* NOTE(review): r0 is overwritten in both L(huge) and L(tail_bytes) before this value is read. */
71 dcbtst 0,10 /* Touch-for-store hint on the block at r10. */
72 cmpldi cr6,4,0 /* cr6.eq: the fill pattern is zero. */
73 srdi 9,5,3 /* Number of full doublewords remaining. */
74 crand 27,26,21 /* CR bit 27 = cr6.eq AND cr5.gt: zero fill and > 255 bytes. */
75 mtocrf 0x01,9 /* cr7 = low four bits of the doubleword count. */
76 bt 27,L(huge)
77
78 /* Reached when the fill value is non-zero or at most 255 bytes
79 remain, so dcbz is not used on this path. */
80
81 srdi 8,5,5 /* r8 = number of 32-byte chunks. */
82 clrldi 11,5,61 /* r11 = r5 & 7, the sub-doubleword tail. */
83 cmpldi cr6,11,0 /* cr6.eq: no tail bytes; tested at L(tail_bytes). */
84 cmpldi cr1,9,4 /* cr1.lt: fewer than 4 doublewords in total. */
85 mtctr 8
86
87 /* Store (doubleword count mod 4) doublewords first, so that the
88 doublewords left for the main loop form whole 32-byte chunks. */
89
90 bf 30,1f
91
92 std 4,0(10)
93 std 4,8(10)
94 addi 10,10,16
95 bf 31,L(big_loop)
96
97 std 4,0(10)
98 addi 10,10,8
99 mr 12,10 /* L(tail_bytes) stores through r12. */
100 blt cr1,L(tail_bytes) /* Exactly 3 doublewords in total: CTR is 0, so skip the loop. */
101 b L(big_loop)
102
103 .align 4
104 1: /* Copy 1 doubleword. */
105 bf 31,L(big_loop)
106
107 std 4,0(10)
108 addi 10,10,8
109
110 /* Main aligned copy loop. Copies 32-bytes at a time and
111 ping-pong through r10 and r12 to avoid AGEN delays. */
112 .align 4
113 L(big_loop):
114 addi 12,10,32
115 std 4,0(10)
116 std 4,8(10)
117 std 4,16(10)
118 std 4,24(10)
119 bdz L(tail_bytes) /* Last chunk done; r12 already points just past it. */
120
121 addi 10,10,64
122 std 4,0(12)
123 std 4,8(12)
124 std 4,16(12)
125 std 4,24(12)
126 bdnz L(big_loop)
127
128 mr 12,10 /* On this exit r10 is the first byte not yet stored. */
129 b L(tail_bytes)
130
131 .align 4
132 L(tail_bytes):
/* Final 0-7 bytes of the 32-byte-loop path.
   In: r12 = next address to store, r5 = length (only its low three
   bits are used), cr6.eq = (r5 & 7) == 0 as set before the loop. */
133
134 /* Check for tail bytes. */
135 beqlr cr6
136
137 clrldi 0,5,61 /* r0 = r5 & 7. */
138 mtocrf 0x01,0 /* cr7 = low four bits of r0. */
139
140 /* At this point we have a tail of 1-7 bytes and we know that the
141 destination is doubleword-aligned. */
142 4: /* Copy 4 bytes. */
143 bf 29,2f
144
145 stw 4,0(12)
146 addi 12,12,4
147 2: /* Copy 2 bytes. */
148 bf 30,1f
149
150 sth 4,0(12)
151 addi 12,12,2
152 1: /* Copy 1 byte. */
153 bflr 31
154
155 stb 4,0(12)
156 blr
157
158 /* Special case when value is 0 and we have a long length to deal
159 with (more than 255 bytes). Use dcbz to zero out 128-bytes at a time.
160 Before using dcbz though, we need to get the destination 128-bytes
    aligned.
    In: r10 = 8-byte aligned destination, r5 = bytes left, r12 = original
    length, r4 = 0. */
161 .align 4
162 L(huge):
163 andi. 11,10,127 /* cr0.eq: already 128-byte aligned. */
164 neg 0,10
165 beq L(huge_aligned)
166
167 clrldi 0,0,57 /* r0 = (-r10) & 127 = bytes to the next 128-byte boundary. */
168 subf 5,0,5 /* Take them off the length. */
169 srdi 0,0,3 /* Convert to doublewords (r10 is already 8-byte aligned). */
170 mtocrf 0x01,0 /* cr7 = that doubleword count (0-15). */
171
172 /* Get DST aligned to 128 bytes. */
173 8: bf 28,4f /* 8 doublewords. */
174
175 std 4,0(10)
176 std 4,8(10)
177 std 4,16(10)
178 std 4,24(10)
179 std 4,32(10)
180 std 4,40(10)
181 std 4,48(10)
182 std 4,56(10)
183 addi 10,10,64
184 .align 4
185 4: bf 29,2f /* 4 doublewords. */
186
187 std 4,0(10)
188 std 4,8(10)
189 std 4,16(10)
190 std 4,24(10)
191 addi 10,10,32
192 .align 4
193 2: bf 30,1f /* 2 doublewords. */
194
195 std 4,0(10)
196 std 4,8(10)
197 addi 10,10,16
198 .align 4
199 1: bf 31,L(huge_aligned) /* 1 doubleword. */
200
201 std 4,0(10)
202 addi 10,10,8
203
204
205 L(huge_aligned):
/* r5 was above 255 on entry to L(huge) and at most 127 bytes were
   removed above, so r8 below is at least 1 and the bdnz loop always
   terminates normally. */
206 srdi 8,5,7 /* r8 = number of 128-byte blocks. */
207 clrldi 11,5,57 /* r11 = r5 & 127, bytes left after the dcbz loop. */
208 cmpldi cr6,11,0
209 mtctr 8
210
211 .align 4
212 L(huge_loop):
213 dcbz 0,10 /* NOTE(review): the 128-byte stride assumes dcbz clears 128 bytes. */
214 addi 10,10,128
215 bdnz L(huge_loop)
216
217 /* Check how many bytes are still left. */
218 beqlr cr6
219
220 subf 9,3,10 /* r9 = r10 - s = bytes written so far. */
221 subf 5,9,12 /* r5 = original length - bytes written = bytes left. */
222 srdi 8,5,3 /* r8 = full doublewords left. */
223 cmpldi cr6,8,0
224 mtocrf 0x01,8 /* cr7 = that doubleword count. */
225
226 /* We have a tail of 1~127 bytes. Copy up to 15 doublewords for
227 speed. We'll handle the resulting tail bytes later. */
228 beq cr6,L(tail)
229
230 8: bf 28,4f /* 8 doublewords. */
231
232 std 4,0(10)
233 std 4,8(10)
234 std 4,16(10)
235 std 4,24(10)
236 std 4,32(10)
237 std 4,40(10)
238 std 4,48(10)
239 std 4,56(10)
240 addi 10,10,64
241 .align 4
242 4: bf 29,2f /* 4 doublewords. */
243
244 std 4,0(10)
245 std 4,8(10)
246 std 4,16(10)
247 std 4,24(10)
248 addi 10,10,32
249 .align 4
250 2: bf 30,1f /* 2 doublewords. */
251
252 std 4,0(10)
253 std 4,8(10)
254 addi 10,10,16
255 .align 4
256 1: bf 31,L(tail) /* 1 doubleword. */
257
258 std 4,0(10)
259 addi 10,10,8
260
261 /* Handle the rest of the tail bytes here. */
262 L(tail):
263 mtocrf 0x01,5 /* cr7 = low four bits of r5; bits 29/30/31 select 4/2/1 bytes. */
264
265 .align 4
266 4: bf 29,2f
267
268 stw 4,0(10)
269 addi 10,10,4
270 .align 4
271 2: bf 30,1f
272
273 sth 4,0(10)
274 addi 10,10,2
275 .align 4
276 1: bflr 31
277
278 stb 4,0(10)
279 blr
280
281 /* Expanded tree to copy tail bytes without increments.
    Reached from L(small) with fewer than 8 bytes to store: r10 is the
    destination and cr7 bits 29/30/31 are the 4/2/1 bits of the length.
    Each leaf stores at a fixed offset from r10 instead of advancing
    the pointer. The label names appear to encode the outcome of the
    4-bit and 2-bit tests (T = set, F = clear). */
282 .align 4
283 L(copy_tail):
284 bf 29,L(FXX)
285
286 stw 4,0(10) /* Bytes 0-3. */
287 bf 30,L(TFX)
288
289 sth 4,4(10) /* Bytes 4-5. */
290 bflr 31
291
292 stb 4,6(10) /* Byte 6. */
293 blr
294
295 .align 4
296 L(FXX): bf 30,L(FFX) /* No word was stored. */
297
298 sth 4,0(10) /* Bytes 0-1. */
299 bflr 31
300
301 stb 4,2(10) /* Byte 2. */
302 blr
303
304 .align 4
305 L(TFX): bflr 31 /* Word stored, no halfword. */
306
307 stb 4,4(10) /* Byte 4. */
308 blr
309
310 .align 4
311 L(FFX): bflr 31 /* Neither word nor halfword; returns here when the length is 0. */
312
313 stb 4,0(10) /* Byte 0. */
314 blr
315
316 /* Handle copies of 9~31 bytes.
    In: r10 = destination, r5 = length, r0 = -s (set before the branch
    here), r4 = fill byte replicated to a word. */
317 .align 4
318 L(medium):
319 /* At least 9 bytes to go. */
320 andi. 11,10,3 /* cr0.eq: DST already word-aligned. */
321 clrldi 0,0,62 /* r0 = (-s) & 3 = bytes needed to reach word alignment. */
322 beq L(medium_aligned)
323
324 /* Force 4-bytes alignment for DST. */
325 mtocrf 0x01,0 /* cr7 = low four bits of r0. */
326 subf 5,0,5 /* Take the alignment bytes off the length. */
327 1: /* Copy 1 byte. */
328 bf 31,2f
329
330 stb 4,0(10)
331 addi 10,10,1
332 2: /* Copy 2 bytes. */
333 bf 30,L(medium_aligned)
334
335 sth 4,0(10)
336 addi 10,10,2
337
338 .align 4
339 L(medium_aligned):
340 /* At least 6 bytes to go, and DST is word-aligned. */
341 cmpldi cr1,5,16 /* cr1.lt: fewer than 16 bytes left. */
342 mtocrf 0x01,5 /* cr7 = low four bits of the length (8/4/2/1 bits). */
343 blt cr1,8f
344
345 /* Copy 16 bytes. */
346 stw 4,0(10)
347 stw 4,4(10)
348 stw 4,8(10)
349 stw 4,12(10)
350 addi 10,10,16
351 8: /* Copy 8 bytes. */
352 bf 28,4f
353
354 stw 4,0(10)
355 stw 4,4(10)
356 addi 10,10,8
357 4: /* Copy 4 bytes. */
358 bf 29,2f
359
360 stw 4,0(10)
361 addi 10,10,4
362 2: /* Copy 2 bytes. */
363 bf 30,1f
364
365 sth 4,0(10)
366 addi 10,10,2
367 1: /* Copy 1 byte. */
368 bflr 31
369
370 stb 4,0(10)
371 blr
372
373 /* Handles copies of 0~8 bytes.
    In: r10 = destination, r5 = length, r4 = fill byte replicated to a
    word, cr6 = result of comparing the length with 8 at entry. */
374 .align 4
375 L(small):
376 mtocrf 0x01,5 /* cr7 = low four bits of the length, for L(copy_tail). */
377 bne cr6,L(copy_tail) /* Length is not 8: 0-7 bytes. */
378
379 stw 4,0(10) /* Length is exactly 8: two word stores. */
380 stw 4,4(10)
381 blr
382
383 END_GEN_TB (memset,TB_TOCLESS)
384 libc_hidden_builtin_def (memset)
385
386 /* Copied from bzero.S to prevent the linker from inserting a stub
387 between bzero and memset.
    Rearranges the arguments into memset's registers (length in r5,
    fill value 0 in r4; r3 is left as is) and continues at L(_memset). */
388 ENTRY (__bzero)
389 CALL_MCOUNT 3
390 mr r5,r4 /* r5 = length, which __bzero receives in r4. */
391 li r4,0 /* Fill value is zero. */
392 b L(_memset) /* Enters memset after its own CALL_MCOUNT. */
393 END (__bzero)
394 #ifndef __bzero
395 weak_alias (__bzero, bzero)
396 #endif