/* Optimized memset implementation for PowerPC.
   Copyright (C) 1997-2017 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* void * [r3] memset (void *s [r3], int c [r4], size_t n [r5]));
   Returns 's'.

   The memset is done in four sizes: byte (8 bits), word (32 bits),
   32-byte blocks (256 bits) and __cache_line_size (128, 256, 1024 bits).
   There is a special case for setting whole cache lines to 0, which
   takes advantage of the dcbz instruction.  */

	.section ".text"
EALIGN (memset, 5, 1)

#define rTMP	r0
#define rRTN	r3	/* initial value of 1st argument */
#define rMEMP0	r3	/* original value of 1st arg */
#define rCHR	r4	/* char to set in each byte */
#define rLEN	r5	/* length of region to set */
#define rMEMP	r6	/* address at which we are storing */
#define rALIGN	r7	/* number of bytes we are setting now (when aligning) */
#define rMEMP2	r8

#define rPOS32	r7	/* constant +32 for clearing with dcbz */
#define rNEG64	r8	/* constant -64 for clearing with dcbz */
#define rNEG32	r9	/* constant -32 for clearing with dcbz */

#define rGOT	r9	/* Address of the Global Offset Table.  */
#define rCLS	r8	/* Cache line size obtained from static.  */
#define rCLM	r9	/* Cache line size mask to check for cache alignment.  */

/* take care of case for size <= 4  */
	cmplwi	cr1, rLEN, 4
	andi.	rALIGN, rMEMP0, 3
	mr	rMEMP, rMEMP0
	ble-	cr1, L(small)
/* align to word boundary  */
	cmplwi	cr5, rLEN, 31
	rlwimi	rCHR, rCHR, 8, 16, 23	/* replicate byte into halfword */
	beq+	L(aligned)	/* 8th instruction from .align */
	mtcrf	0x01, rMEMP0
	subfic	rALIGN, rALIGN, 4
	add	rMEMP, rMEMP, rALIGN
	sub	rLEN, rLEN, rALIGN
	bf+	31, L(g0)
	stb	rCHR, 0(rMEMP0)
	bt	30, L(aligned)
L(g0):	sth	rCHR, -2(rMEMP)	/* 16th instruction from .align */
/* take care of case for size < 31 */
L(aligned):
	mtcrf	0x01, rLEN
	rlwimi	rCHR, rCHR, 16, 0, 15	/* replicate halfword into word */
	ble	cr5, L(medium)
/* align to cache line boundary...  */
	andi.	rALIGN, rMEMP, 0x1C
	subfic	rALIGN, rALIGN, 0x20
	beq	L(caligned)
	mtcrf	0x01, rALIGN
	add	rMEMP, rMEMP, rALIGN
	sub	rLEN, rLEN, rALIGN
	cmplwi	cr1, rALIGN, 0x10
	mr	rMEMP2, rMEMP
	bf	28, L(a1)
	stw	rCHR, -4(rMEMP2)
	stwu	rCHR, -8(rMEMP2)
L(a1):	blt	cr1, L(a2)
	stw	rCHR, -4(rMEMP2) /* 32nd instruction from .align */
	stw	rCHR, -8(rMEMP2)
	stw	rCHR, -12(rMEMP2)
	stwu	rCHR, -16(rMEMP2)
L(a2):	bf	29, L(caligned)
	stw	rCHR, -4(rMEMP2)
/* now aligned to a cache line.  */
L(caligned):
	cmplwi	cr1, rCHR, 0
	clrrwi.	rALIGN, rLEN, 5
	mtcrf	0x01, rLEN	/* 40th instruction from .align */

/* Check if we can use the special case for clearing memory using dcbz.
   This requires that we know the correct cache line size for this
   processor.  Getting the __cache_line_size may require establishing GOT
   addressability, so branch out of line to set this up.  */
	beq	cr1, L(checklinesize)

/* Store blocks of 32-bytes (256-bits) starting on a 32-byte boundary.
   Can't assume that rCHR is zero or that the cache line size is either
   32-bytes or even known.  */
L(nondcbz):
	srwi	rTMP, rALIGN, 5
	mtctr	rTMP
	beq	L(medium)	/* we may not actually get to do a full line */
	clrlwi.	rLEN, rLEN, 27
	add	rMEMP, rMEMP, rALIGN
	li	rNEG64, -0x40
	bdz	L(cloopdone)	/* 48th instruction from .align */

/* We can't use dcbz here as we don't know the cache line size.  We can
   use "data cache block touch for store", which is safe.  */
L(c3):	dcbtst	rNEG64, rMEMP
	stw	rCHR, -4(rMEMP)
	stw	rCHR, -8(rMEMP)
	stw	rCHR, -12(rMEMP)
	stw	rCHR, -16(rMEMP)
	nop			/* let 601 fetch last 4 instructions of loop */
	stw	rCHR, -20(rMEMP)
	stw	rCHR, -24(rMEMP) /* 56th instruction from .align */
	nop			/* let 601 fetch first 8 instructions of loop */
	stw	rCHR, -28(rMEMP)
	stwu	rCHR, -32(rMEMP)
	bdnz	L(c3)
L(cloopdone):
	stw	rCHR, -4(rMEMP)
	stw	rCHR, -8(rMEMP)
	stw	rCHR, -12(rMEMP)
	stw	rCHR, -16(rMEMP)	/* 64th instruction from .align */
	stw	rCHR, -20(rMEMP)
	cmplwi	cr1, rLEN, 16
	stw	rCHR, -24(rMEMP)
	stw	rCHR, -28(rMEMP)
	stwu	rCHR, -32(rMEMP)
	beqlr
	add	rMEMP, rMEMP, rALIGN
	b	L(medium_tail2)	/* 72nd instruction from .align */

	.align 5
	nop
/* Clear cache lines of memory in 128-byte chunks.
   This code is optimized for processors with 32-byte cache lines.
   It is further optimized for the 601 processor, which requires
   some care in how the code is aligned in the i-cache.  */
L(zloopstart):
	clrlwi	rLEN, rLEN, 27
	mtcrf	0x02, rALIGN
	srwi.	rTMP, rALIGN, 7
	mtctr	rTMP
	li	rPOS32, 0x20
	li	rNEG64, -0x40
	cmplwi	cr1, rLEN, 16	/* 8 */
	bf	26, L(z0)
	dcbz	0, rMEMP
	addi	rMEMP, rMEMP, 0x20
L(z0):	li	rNEG32, -0x20
	bf	25, L(z1)
	dcbz	0, rMEMP
	dcbz	rPOS32, rMEMP
	addi	rMEMP, rMEMP, 0x40	/* 16 */
L(z1):	cmplwi	cr5, rLEN, 0
	beq	L(medium)
L(zloop):
	dcbz	0, rMEMP
	dcbz	rPOS32, rMEMP
	addi	rMEMP, rMEMP, 0x80
	dcbz	rNEG64, rMEMP
	dcbz	rNEG32, rMEMP
	bdnz	L(zloop)
	beqlr	cr5
	b	L(medium_tail2)

	.align 5
L(small):
/* Memset of 4 bytes or less.  */
	cmplwi	cr5, rLEN, 1
	cmplwi	cr1, rLEN, 3
	bltlr	cr5
	stb	rCHR, 0(rMEMP)
	beqlr	cr5
	nop
	stb	rCHR, 1(rMEMP)
	bltlr	cr1
	stb	rCHR, 2(rMEMP)
	beqlr	cr1
	nop
	stb	rCHR, 3(rMEMP)
	blr

/* Memset of 0-31 bytes.  */
	.align 5
L(medium):
	cmplwi	cr1, rLEN, 16
L(medium_tail2):
	add	rMEMP, rMEMP, rLEN
L(medium_tail):
	bt-	31, L(medium_31t)
	bt-	30, L(medium_30t)
L(medium_30f):
	bt-	29, L(medium_29t)
L(medium_29f):
	bge-	cr1, L(medium_27t)
	bflr-	28
	stw	rCHR, -4(rMEMP)	/* 8th instruction from .align */
	stw	rCHR, -8(rMEMP)
	blr

L(medium_31t):
	stbu	rCHR, -1(rMEMP)
	bf-	30, L(medium_30f)
L(medium_30t):
	sthu	rCHR, -2(rMEMP)
	bf-	29, L(medium_29f)
L(medium_29t):
	stwu	rCHR, -4(rMEMP)
	blt-	cr1, L(medium_27f)	/* 16th instruction from .align */
L(medium_27t):
	stw	rCHR, -4(rMEMP)
	stw	rCHR, -8(rMEMP)
	stw	rCHR, -12(rMEMP)
	stwu	rCHR, -16(rMEMP)
L(medium_27f):
	bflr-	28
L(medium_28t):
	stw	rCHR, -4(rMEMP)
	stw	rCHR, -8(rMEMP)
	blr

L(checklinesize):
#ifdef SHARED
	mflr	rTMP
/* If the remaining length is less the 32 bytes then don't bother getting
   the cache line size.  */
	beq	L(medium)
/* Establishes GOT addressability so we can load __cache_line_size
   from static.  This value was set from the aux vector during startup.  */
	SETUP_GOT_ACCESS(rGOT,got_label)
	addis	rGOT,rGOT,__cache_line_size-got_label@ha
	lwz	rCLS,__cache_line_size-got_label@l(rGOT)
	mtlr	rTMP
#else
/* Load __cache_line_size from static.  This value was set from the
   aux vector during startup.  */
	lis	rCLS,__cache_line_size@ha
/* If the remaining length is less the 32 bytes then don't bother getting
   the cache line size.  */
	beq	L(medium)
	lwz	rCLS,__cache_line_size@l(rCLS)
#endif

/* If the cache line size was not set then goto to L(nondcbz), which is
   safe for any cache line size.  */
	cmplwi	cr1,rCLS,0
	beq	cr1,L(nondcbz)

/* If the cache line size is 32 bytes then goto to L(zloopstart),
   which is coded specifically for 32-byte lines (and 601).  */
	cmplwi	cr1,rCLS,32
	beq	cr1,L(zloopstart)

/* Now we know the cache line size and it is not 32-bytes.  However
   we may not yet be aligned to the cache line and may have a partial
   line to fill.  Touch it 1st to fetch the cache line.  */
	dcbtst	0,rMEMP

	addi	rCLM,rCLS,-1
L(getCacheAligned):
	cmplwi	cr1,rLEN,32
	and.	rTMP,rCLM,rMEMP
	blt	cr1,L(handletail32)
	beq	L(cacheAligned)
/* We are not aligned to start of a cache line yet.  Store 32-byte
   of data and test again.  */
	addi	rMEMP,rMEMP,32
	addi	rLEN,rLEN,-32
	stw	rCHR,-32(rMEMP)
	stw	rCHR,-28(rMEMP)
	stw	rCHR,-24(rMEMP)
	stw	rCHR,-20(rMEMP)
	stw	rCHR,-16(rMEMP)
	stw	rCHR,-12(rMEMP)
	stw	rCHR,-8(rMEMP)
	stw	rCHR,-4(rMEMP)
	b	L(getCacheAligned)

/* Now we are aligned to the cache line and can use dcbz.  */
L(cacheAligned):
	cmplw	cr1,rLEN,rCLS
	blt	cr1,L(handletail32)
	dcbz	0,rMEMP
	subf	rLEN,rCLS,rLEN
	add	rMEMP,rMEMP,rCLS
	b	L(cacheAligned)

/* We are here because; the cache line size was set, it was not
   32-bytes, and the remainder (rLEN) is now less than the actual cache
   line size.  Set up the preconditions for L(nondcbz) and go there to
   store the remaining bytes.  */
L(handletail32):
	clrrwi.	rALIGN, rLEN, 5
	b	L(nondcbz)

END (memset)
libc_hidden_builtin_def (memset)