]>
Commit | Line | Data |
---|---|---|
a88f47a7 | 1 | /* Optimized 64-bit memset implementation for POWER6. |
b168057a | 2 | Copyright (C) 1997-2015 Free Software Foundation, Inc. |
04067002 UD |
3 | This file is part of the GNU C Library. |
4 | ||
5 | The GNU C Library is free software; you can redistribute it and/or | |
6 | modify it under the terms of the GNU Lesser General Public | |
7 | License as published by the Free Software Foundation; either | |
8 | version 2.1 of the License, or (at your option) any later version. | |
9 | ||
10 | The GNU C Library is distributed in the hope that it will be useful, | |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | Lesser General Public License for more details. | |
14 | ||
15 | You should have received a copy of the GNU Lesser General Public | |
59ba27a6 PE |
16 | License along with the GNU C Library; if not, see |
17 | <http://www.gnu.org/licenses/>. */ | |
04067002 UD |
18 | |
19 | #include <sysdep.h> | |
04067002 | 20 | |
04067002 UD |
21 | /* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]); |
22 | Returns 's'. | |
23 | ||
24 | The memset is done in several store sizes: byte (8 bits), halfword | |
25 | (16 bits), word (32 bits), doubleword (64 bits) and, when the fill value | |
26 | is 0, whole 128-byte cache lines using the dcbz instruction. */ | |
27 | ||
a88f47a7 | 28 | .machine power6 |
EALIGN (memset, 7, 0)
	CALL_MCOUNT 3

/* Register roles.  r3 is only ever used as a base address by the code
   below and is never written, so the original pointer is still in r3
   when the function returns.  */
#define rTMP	r0
#define rRTN	r3	/* Initial value of 1st argument.  */
#define rMEMP0	r3	/* Original value of 1st arg.  */
#define rCHR	r4	/* Char to set in each byte.  */
#define rLEN	r5	/* Length of region to set.  */
#define rMEMP	r6	/* Address at which we are storing.  */
#define rALIGN	r7	/* Number of bytes we are setting now (when aligning).  */
#define rMEMP2	r8
#define rMEMP3	r9	/* Alt mem pointer.  */
L(_memset):
/* Take care of case for size <= 8.  */
	cmpldi	cr1, rLEN, 8
	andi.	rALIGN, rMEMP0, 7
	mr	rMEMP, rMEMP0
	ble	cr1, L(small)

/* Align to doubleword boundary.  cr5 records whether the original
   length is <= 31; it is consumed at L(aligned).  */
	cmpldi	cr5, rLEN, 31
	insrdi	rCHR, rCHR, 8, 48	/* Replicate byte to halfword.  */
	beq+	L(aligned2)
	mtcrf	0x01, rMEMP0
	subfic	rALIGN, rALIGN, 8
	cror	28,30,31		/* Detect odd word aligned.  */
	add	rMEMP, rMEMP, rALIGN
	sub	rLEN, rLEN, rALIGN
	insrdi	rCHR, rCHR, 16, 32	/* Replicate halfword to word.  */
	bt	29, L(g4)
/* Process the even word of doubleword.  */
	bf+	31, L(g2)
	stb	rCHR, 0(rMEMP0)
	bt	30, L(g4x)
L(g2):
	sth	rCHR, -6(rMEMP)
L(g4x):
	stw	rCHR, -4(rMEMP)
	b	L(aligned)
/* Process the odd word of doubleword.  */
L(g4):
	bf	28, L(g4x)	/* If false, word aligned on odd word.  */
	bf+	31, L(g0)
	stb	rCHR, 0(rMEMP0)
	bt	30, L(aligned)
L(g0):
	sth	rCHR, -2(rMEMP)

/* Handle the case of size <= 31.  */
L(aligned2):
	insrdi	rCHR, rCHR, 16, 32	/* Replicate halfword to word.  */
L(aligned):
	mtcrf	0x01, rLEN
	ble	cr5, L(medium)
/* Align to 32-byte boundary.  */
	andi.	rALIGN, rMEMP, 0x18
	subfic	rALIGN, rALIGN, 0x20
	insrdi	rCHR, rCHR, 32, 0	/* Replicate word to double word.  */
	beq	L(caligned)
	mtcrf	0x01, rALIGN
	add	rMEMP, rMEMP, rALIGN
	sub	rLEN, rLEN, rALIGN
	cmplwi	cr1, rALIGN, 0x10
	mr	rMEMP2, rMEMP
	bf	28, L(a1)
	stdu	rCHR, -8(rMEMP2)
L(a1):	blt	cr1, L(a2)
	std	rCHR, -8(rMEMP2)
	stdu	rCHR, -16(rMEMP2)
L(a2):

/* Now aligned to a 32 byte boundary.  */
	.align 4
L(caligned):
	cmpldi	cr1, rCHR, 0
	clrrdi.	rALIGN, rLEN, 5
	mtcrf	0x01, rLEN
	beq	cr1, L(zloopstart) /* Special case for clearing memory using dcbz.  */
	beq	L(medium)	/* We may not actually get to do a full line.  */
	.align 4
/* Storing a non-zero "c" value.  We are aligned at a sector (32-byte)
   boundary but may not be at cache line (128-byte) boundary.  */
L(nzloopstart):
/* memset in 32-byte chunks until we get to a cache line boundary.
   If rLEN is less than the distance to the next cache-line boundary use
   cacheAligned1 code to finish the tail.  */
	cmpldi	cr1,rLEN,128

	andi.	rTMP,rMEMP,127
	blt	cr1,L(cacheAligned1)
	addi	rMEMP3,rMEMP,32
	beq	L(nzCacheAligned)
	addi	rLEN,rLEN,-32
	std	rCHR,0(rMEMP)
	std	rCHR,8(rMEMP)
	std	rCHR,16(rMEMP)
	addi	rMEMP,rMEMP,32
	andi.	rTMP,rMEMP3,127
	std	rCHR,-8(rMEMP3)

	beq	L(nzCacheAligned)
	addi	rLEN,rLEN,-32
	std	rCHR,0(rMEMP3)
	addi	rMEMP,rMEMP,32
	std	rCHR,8(rMEMP3)
	andi.	rTMP,rMEMP,127
	std	rCHR,16(rMEMP3)
	std	rCHR,24(rMEMP3)

	beq	L(nzCacheAligned)
	addi	rLEN,rLEN,-32
	std	rCHR,32(rMEMP3)
	addi	rMEMP,rMEMP,32
	cmpldi	cr1,rLEN,128
	std	rCHR,40(rMEMP3)
	cmpldi	cr6,rLEN,256
	li	rMEMP2,128
	std	rCHR,48(rMEMP3)
	std	rCHR,56(rMEMP3)
	blt	cr1,L(cacheAligned1)
	b	L(nzCacheAligned128)

/* Now we are aligned to the cache line and can use dcbtst.  */
	.align 4
L(nzCacheAligned):
	cmpldi	cr1,rLEN,128
	blt	cr1,L(cacheAligned1)
	b	L(nzCacheAligned128)
	.align 5
/* Store one 128-byte line per iteration.  cr1 is set from rLEN before
   the subtraction, so "bge" loops while at least 128 bytes remain
   after this line.  */
L(nzCacheAligned128):
	cmpldi	cr1,rLEN,256
	addi	rMEMP3,rMEMP,64
	std	rCHR,0(rMEMP)
	std	rCHR,8(rMEMP)
	std	rCHR,16(rMEMP)
	std	rCHR,24(rMEMP)
	std	rCHR,32(rMEMP)
	std	rCHR,40(rMEMP)
	std	rCHR,48(rMEMP)
	std	rCHR,56(rMEMP)
	addi	rMEMP,rMEMP3,64
	addi	rLEN,rLEN,-128
	std	rCHR,0(rMEMP3)
	std	rCHR,8(rMEMP3)
	std	rCHR,16(rMEMP3)
	std	rCHR,24(rMEMP3)
	std	rCHR,32(rMEMP3)
	std	rCHR,40(rMEMP3)
	std	rCHR,48(rMEMP3)
	std	rCHR,56(rMEMP3)
	bge	cr1,L(nzCacheAligned128)
	dcbtst	0,rMEMP
	b	L(cacheAligned1)
	.align 5
/* Storing a zero "c" value.  We are aligned at a sector (32-byte)
   boundary but may not be at cache line (128-byte) boundary.  If the
   remaining length spans a full cache line we can use the Data cache
   block zero instruction.  */
L(zloopstart):
/* memset in 32-byte chunks until we get to a cache line boundary.
   If rLEN is less than the distance to the next cache-line boundary use
   cacheAligned1 code to finish the tail.  */
	cmpldi	cr1,rLEN,128
	beq	L(medium)	/* cr0 from the clrrdi. at L(caligned): rLEN < 32.  */
L(getCacheAligned):
	andi.	rTMP,rMEMP,127
	nop
	blt	cr1,L(cacheAligned1)
	addi	rMEMP3,rMEMP,32
	beq	L(cacheAligned)
	addi	rLEN,rLEN,-32
	std	rCHR,0(rMEMP)
	std	rCHR,8(rMEMP)
	std	rCHR,16(rMEMP)
	addi	rMEMP,rMEMP,32
	andi.	rTMP,rMEMP3,127
	std	rCHR,-8(rMEMP3)
L(getCacheAligned2):
	beq	L(cacheAligned)
	addi	rLEN,rLEN,-32
	std	rCHR,0(rMEMP3)
	std	rCHR,8(rMEMP3)
	addi	rMEMP,rMEMP,32
	andi.	rTMP,rMEMP,127
	std	rCHR,16(rMEMP3)
	std	rCHR,24(rMEMP3)
L(getCacheAligned3):
	beq	L(cacheAligned)
	addi	rLEN,rLEN,-32
	std	rCHR,32(rMEMP3)
	addi	rMEMP,rMEMP,32
	cmpldi	cr1,rLEN,128
	std	rCHR,40(rMEMP3)
	cmpldi	cr6,rLEN,256
	li	rMEMP2,128
	std	rCHR,48(rMEMP3)
	std	rCHR,56(rMEMP3)
	blt	cr1,L(cacheAligned1)
	blt	cr6,L(cacheAligned128)
	b	L(cacheAlignedx)

/* Now we are aligned to the cache line and can use dcbz.  */
	.align 5
L(cacheAligned):
	cmpldi	cr1,rLEN,128
	cmpldi	cr6,rLEN,256
	blt	cr1,L(cacheAligned1)
	li	rMEMP2,128
L(cacheAlignedx):
	cmpldi	cr5,rLEN,640
	blt	cr6,L(cacheAligned128)
	bgt	cr5,L(cacheAligned512)
	cmpldi	cr6,rLEN,512
	dcbz	0,rMEMP
	cmpldi	cr1,rLEN,384
	dcbz	rMEMP2,rMEMP
	addi	rMEMP,rMEMP,256
	addi	rLEN,rLEN,-256
	blt	cr1,L(cacheAligned1)
	blt	cr6,L(cacheAligned128)
	b	L(cacheAligned256)
	.align 5
/* A simple loop for the longer (>640 bytes) lengths.  This form limits
   the number of mispredicted branches to exactly 1, at loop exit.  */
L(cacheAligned512):
	/* rLEN is a 64-bit size_t, so this must be the doubleword compare
	   like every other length test in this file.  The three-operand
	   "cmpli" form used previously leaves the L field at 0, i.e. a
	   32-bit compare, which would end the loop early whenever the low
	   word of a very large rLEN happened to be below 128.  */
	cmpldi	cr1,rLEN,128
	blt	cr1,L(cacheAligned1)
	dcbz	0,rMEMP
	addi	rLEN,rLEN,-128
	addi	rMEMP,rMEMP,128
	b	L(cacheAligned512)
	.align 5
/* Zero two cache lines (256 bytes) per iteration; rMEMP2 holds 128.  */
L(cacheAligned256):

	cmpldi	cr6,rLEN,512

	dcbz	0,rMEMP
	cmpldi	cr1,rLEN,384
	dcbz	rMEMP2,rMEMP
	addi	rMEMP,rMEMP,256
	addi	rLEN,rLEN,-256

	bge	cr6,L(cacheAligned256)

	blt	cr1,L(cacheAligned1)
	.align 4
L(cacheAligned128):
	dcbz	0,rMEMP
	addi	rMEMP,rMEMP,128
	addi	rLEN,rLEN,-128
	nop
/* Fewer than 128 bytes remain: store up to three 32-byte sectors.  */
L(cacheAligned1):
	cmpldi	cr1,rLEN,32
	blt	cr1,L(handletail32)
	addi	rMEMP3,rMEMP,32
	addi	rLEN,rLEN,-32
	std	rCHR,0(rMEMP)
	std	rCHR,8(rMEMP)
	std	rCHR,16(rMEMP)
	addi	rMEMP,rMEMP,32
	cmpldi	cr1,rLEN,32
	std	rCHR,-8(rMEMP3)
L(cacheAligned2):
	blt	cr1,L(handletail32)
	addi	rLEN,rLEN,-32
	std	rCHR,0(rMEMP3)
	std	rCHR,8(rMEMP3)
	addi	rMEMP,rMEMP,32
	cmpldi	cr1,rLEN,32
	std	rCHR,16(rMEMP3)
	std	rCHR,24(rMEMP3)
	nop
L(cacheAligned3):
	blt	cr1,L(handletail32)
	addi	rMEMP,rMEMP,32
	addi	rLEN,rLEN,-32
	std	rCHR,32(rMEMP3)
	std	rCHR,40(rMEMP3)
	std	rCHR,48(rMEMP3)
	std	rCHR,56(rMEMP3)

/* We are here because the length or remainder (rLEN) is less than the
   cache line/sector size and does not justify aggressive loop unrolling.
   So set up the preconditions for L(medium) and go there.  cr7 still
   holds the low four bits of rLEN from the mtcrf at L(caligned); only
   multiples of 32 have been subtracted from rLEN since then.  */
	.align 3
L(handletail32):
	cmpldi	cr1,rLEN,0
	beqlr	cr1
	b	L(medium)

	.align 5
L(small):
/* Memset of 8 bytes or less.  rCHR has not been replicated on this
   path, so only byte stores are used.  */
	cmpldi	cr6, rLEN, 4
	cmpldi	cr5, rLEN, 1
	ble	cr6,L(le4)
	subi	rLEN, rLEN, 4
	stb	rCHR,0(rMEMP)
	stb	rCHR,1(rMEMP)
	stb	rCHR,2(rMEMP)
	stb	rCHR,3(rMEMP)
	addi	rMEMP,rMEMP, 4
	cmpldi	cr5, rLEN, 1
L(le4):
	cmpldi	cr1, rLEN, 3
	bltlr	cr5
	stb	rCHR, 0(rMEMP)
	beqlr	cr5
	stb	rCHR, 1(rMEMP)
	bltlr	cr1
	stb	rCHR, 2(rMEMP)
	beqlr	cr1
	stb	rCHR, 3(rMEMP)
	blr

/* Memset of 0-31 bytes.  Stores run backwards from the end of the
   region; CR bits 28-31 (the low four bits of rLEN) select the 8-, 4-,
   2- and 1-byte pieces and cr1 selects the 16-byte piece.  */
	.align 5
L(medium):
	insrdi	rCHR, rCHR, 32, 0	/* Replicate word to double word.  */
	cmpldi	cr1, rLEN, 16
L(medium_tail2):
	add	rMEMP, rMEMP, rLEN
L(medium_tail):
	bt-	31, L(medium_31t)
	bt-	30, L(medium_30t)
L(medium_30f):
	bt	29, L(medium_29t)
L(medium_29f):
	bge	cr1, L(medium_27t)
	bflr	28
	std	rCHR, -8(rMEMP)
	blr

L(medium_31t):
	stbu	rCHR, -1(rMEMP)
	bf-	30, L(medium_30f)
L(medium_30t):
	sthu	rCHR, -2(rMEMP)
	bf-	29, L(medium_29f)
L(medium_29t):
	stwu	rCHR, -4(rMEMP)
	blt	cr1, L(medium_27f)
L(medium_27t):
	std	rCHR, -8(rMEMP)
	stdu	rCHR, -16(rMEMP)
L(medium_27f):
	bflr	28
L(medium_28t):
	std	rCHR, -8(rMEMP)
	blr
END_GEN_TB (memset,TB_TOCLESS)
libc_hidden_builtin_def (memset)
381 | ||
382 | /* Copied from bzero.S to prevent the linker from inserting a stub | |
383 | between bzero and memset. */ | |
ENTRY (__bzero)
	CALL_MCOUNT 3
	/* Shuffle the incoming arguments into memset's register layout
	   (pointer in r3 is already in place) and join the common code.
	   On entry r4 holds the second argument; it is copied to rLEN
	   before r4 is reused as the fill value.  */
	mr	rLEN, rCHR	/* r5 = r4  */
	li	rCHR, 0		/* r4 = 0: fill value is zero  */
	b	L(_memset)
END (__bzero)
#ifndef __bzero
weak_alias (__bzero, bzero)
#endif