/* Set a block of memory to some byte value.  For SUN4V M7.
   Copyright (C) 2017-2020 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
18 | |
#include <sysdep.h>

#ifndef XCC
# define XCC xcc
#endif
	.register	%g2, #scratch
	.register	%g3, #scratch

/* The algorithm is as follows:
 *
 * For small 7 or fewer bytes stores, bytes will be stored.
 *
 * For less than 32 bytes stores, align the address on 4 byte boundary.
 * Then store as many 4-byte chunks, followed by trailing bytes.
 *
 * For sizes greater than 32 bytes, align the address on 8 byte boundary.
 * if (count >= 64) {
 *	store 8-bytes chunks to align the address on 64 byte boundary
 *	if (value to be set is zero && count >= MIN_ZERO) {
 *		Using BIS stores, set the first long word of each
 *		64-byte cache line to zero which will also clear the
 *		other seven long words of the cache line.
 *	}
 *	else if (count >= MIN_LOOP) {
 *		Using BIS stores, set the first long word of each of
 *		ST_CHUNK cache lines (64 bytes each) before the main
 *		loop is entered.
 *		In the main loop, continue pre-setting the first long
 *		word of each cache line ST_CHUNK lines in advance while
 *		setting the other seven long words (56 bytes) of each
 *		cache line until fewer than ST_CHUNK*64 bytes remain.
 *		Then set the remaining seven long words of each cache
 *		line that has already had its first long word set.
 *	}
 *	store remaining data in 64-byte chunks until less than
 *	64 bytes remain.
 * }
 * Store as many 8-byte chunks, followed by trailing bytes.
 *
 *
 * BIS = Block Init Store
 *   Doing the advance store of the first element of the cache line
 *   initiates the displacement of a cache line while only using a single
 *   instruction in the pipeline.  That avoids various pipeline delays,
 *   such as filling the miss buffer.  The performance effect is
 *   similar to prefetching for normal stores.
 *   The special case for zero fills runs faster and uses fewer instruction
 *   cycles than the normal memset loop.
 *
 * We only use BIS for memset of greater than MIN_LOOP bytes because a sequence
 * BIS stores must be followed by a membar #StoreStore.  The benefit of
 * the BIS store must be balanced against the cost of the membar operation.
 */

/*
 * ASI_STBI_P marks the cache line as "least recently used"
 * which means if many threads are active, it has a high chance
 * of being pushed out of the cache between the first initializing
 * store and the final stores.
 * Thus, we use ASI_STBIMRU_P which marks the cache line as
 * "most recently used" for all but the last store to the cache line.
 */

#define ASI_BLK_INIT_QUAD_LDD_P	0xe2
#define ASI_ST_BLK_INIT_MRU_P	0xf2

#define ASI_STBI_P	ASI_BLK_INIT_QUAD_LDD_P
#define ASI_STBIMRU_P	ASI_ST_BLK_INIT_MRU_P

#define ST_CHUNK	24   /* multiple of 4 due to loop unrolling */
/* Fully parenthesize the expansion so MIN_LOOP stays a single value
   in any expression context (macro hygiene; value is unchanged).  */
#define MIN_LOOP	((ST_CHUNK) * 64)
#define MIN_ZERO	256

#define EX_ST(x)	x
#define EX_RETVAL(x)	x
#define STORE_ASI(src,addr)	stxa src, [addr] ASI_STBIMRU_P
#define STORE_INIT(src,addr)	stxa src, [addr] ASI_STBI_P

#if IS_IN (libc)

	.text
	.align		32
ENTRY(__bzero_niagara7)
	/* bzero (dst, size): remap the arguments onto memset's register
	   contract (%o0 = dst, %o1 = c, %o2 = size) and fall straight
	   through into __memset_niagara7, which must immediately follow
	   this entry point — do not insert code between END here and the
	   next ENTRY.  */
	mov	%o1, %o2	/* size becomes memset's count argument */
	mov	0, %o1		/* fill byte c = 0 */
	/* fall through into memset code */
END(__bzero_niagara7)

ENTRY(__memset_niagara7)
	/* void *memset (void *dst, int c, size_t size)
	   In:  %o0 = dst, %o1 = c, %o2 = size
	   Out: %o0 = dst (never written after entry; returned as-is)
	   The destination pointer is copied to %o5 at entry and all
	   stores advance %o5, leaving %o0 intact for the return value.
	   Instructions placed after a branch occupy its delay slot and
	   execute regardless of the branch outcome (unless annulled).  */
	/* memset (src, c, size) */
	mov	%o0, %o5		/* copy sp1 before using it */
	cmp	%o2, 7			/* if small counts, just write bytes */
	bleu,pn	%XCC, .Lwrchar
	 and	%o1, 0xff, %o1		/* o1 is (char)c */

	/* Replicate the byte value across 2, then 4, then 8 bytes of %o1
	   so wider stores write the correct pattern.  */
	sll	%o1, 8, %o3
	or	%o1, %o3, %o1		/* now o1 has 2 bytes of c */
	sll	%o1, 16, %o3
	cmp	%o2, 32
	blu,pn	%XCC, .Lwdalign
	 or	%o1, %o3, %o1		/* now o1 has 4 bytes of c */

	sllx	%o1, 32, %o3
	or	%o1, %o3, %o1		/* now o1 has 8 bytes of c */

.Ldbalign:
	andcc	%o5, 7, %o3		/* is sp1 aligned on a 8 byte bound? */
	bz,pt	%XCC, .Lblkalign	/* already long word aligned */
	 sub	%o3, 8, %o3		/* -(bytes till long word aligned) */

	add	%o2, %o3, %o2		/* update o2 with new count */
	/* Set -(%o3) bytes till sp1 long word aligned.  */
1:	stb	%o1, [%o5]		/* there is at least 1 byte to set */
	inccc	%o3			/* byte clearing loop */
	bl,pt	%XCC, 1b
	 inc	%o5

	/* Now sp1 is long word aligned (sp1 is found in %o5).  */
.Lblkalign:
	cmp	%o2, 64			/* check if there are 64 bytes to set */
	blu,pn	%XCC, .Lwrshort
	 mov	%o2, %o3

	andcc	%o5, 63, %o3		/* is sp1 block aligned? */
	bz,pt	%XCC, .Lblkwr		/* now block aligned */
	 sub	%o3, 64, %o3		/* o3 is -(bytes till block aligned) */
	add	%o2, %o3, %o2		/* o2 is the remainder */

	/* Store -(%o3) bytes till dst is block (64 byte) aligned.  */
	/* Use long word stores.  */
	/* Recall that dst is already long word aligned.  */
1:
	addcc	%o3, 8, %o3
	stx	%o1, [%o5]
	bl,pt	%XCC, 1b
	 add	%o5, 8, %o5

	/* Now sp1 is block aligned.  */
.Lblkwr:
	andn	%o2, 63, %o4		/* calculate size of blocks in bytes */
	brz,pn	%o1, .Lwrzero		/* special case if c == 0 */
	 and	%o2, 63, %o3		/* %o3 = bytes left after blk stores */

	cmp	%o4, MIN_LOOP		/* check for enough bytes to set */
	blu,pn	%XCC, .Lshort_set	/* to justify cost of membar */
	 nop				/* must be > pre-cleared lines */

	/* initial cache-clearing stores */
	/* get store pipeline moving */

/* Primary memset loop for large memsets.  Phase 1: issue one BIS
   (cache-initializing) store per 64-byte line for ST_CHUNK lines.  */
.Lwr_loop:
	mov	ST_CHUNK, %g1
.Lwr_loop_start:
	subcc	%g1, 4, %g1		/* 4 lines per iteration (unrolled) */
	EX_ST(STORE_ASI(%o1,%o5))
	add	%o5, 64, %o5
	EX_ST(STORE_ASI(%o1,%o5))
	add	%o5, 64, %o5
	EX_ST(STORE_ASI(%o1,%o5))
	add	%o5, 64, %o5
	EX_ST(STORE_ASI(%o1,%o5))
	bgu	%XCC, .Lwr_loop_start
	 add	%o5, 64, %o5

	sub	%o5, ST_CHUNK*64, %o5	/* reset %o5 */
	mov	ST_CHUNK, %g1
	sub	%o5, 8, %o5		/* adjust %o5 for ASI store */

	/* Phase 2: fill in the remaining seven long words of each of the
	   ST_CHUNK lines whose first long word phase 1 already wrote;
	   the final store of each line uses STORE_INIT (delay slot).  */
.Lwr_loop_rest:
	stx	%o1,[%o5+8+8]
	sub	%o4, 64, %o4
	stx	%o1,[%o5+16+8]
	subcc	%g1, 1, %g1
	stx	%o1,[%o5+24+8]
	stx	%o1,[%o5+32+8]
	stx	%o1,[%o5+40+8]
	add	%o5, 64, %o5
	stx	%o1,[%o5-8]
	bgu	%XCC, .Lwr_loop_rest
	 EX_ST(STORE_INIT(%o1,%o5))

	add	%o5, 8, %o5		/* restore %o5 offset */

	/* If more than ST_CHUNK*64 bytes remain to set, continue */
	/* setting the first long word of each cache line in advance */
	/* to keep the store pipeline moving.  */

	cmp	%o4, ST_CHUNK*64
	bge,pt	%XCC, .Lwr_loop_start
	 mov	ST_CHUNK, %g1

	brz,a,pn %o4, .Lasi_done	/* ,a annuls the delay-slot nop when not taken */
	 nop

	/* Tail: fewer than ST_CHUNK lines remain; set each line's first
	   long word and its other seven words in the same pass.  */
	sub	%o5, 8, %o5		/* adjust %o5 for ASI store */
.Lwr_loop_small:
	add	%o5, 8, %o5		/* adjust %o5 for ASI store */
	EX_ST(STORE_ASI(%o1,%o5))
	stx	%o1,[%o5+8]
	stx	%o1,[%o5+16]
	stx	%o1,[%o5+24]
	stx	%o1,[%o5+32]
	subcc	%o4, 64, %o4
	stx	%o1,[%o5+40]
	add	%o5, 56, %o5
	stx	%o1,[%o5-8]
	bgu,pt	%XCC, .Lwr_loop_small
	 EX_ST(STORE_INIT(%o1,%o5))

	ba	.Lasi_done
	 add	%o5, 8, %o5		/* restore %o5 offset */

/* Special case loop for zero fill memsets.  */
/* For each 64 byte cache line, single STBI to first element */
/* clears line.  */
.Lwrzero:
	cmp	%o4, MIN_ZERO		/* check if enough bytes to set */
					/* to pay %asi + membar cost */
	blu	%XCC, .Lshort_set
	 nop
	sub	%o4, 256, %o4		/* bias count; loop handles 256 B/iter */

.Lwrzero_loop:
	mov	64, %g3
	EX_ST(STORE_INIT(%o1,%o5))	/* 4 cache lines (256 bytes) per pass */
	subcc	%o4, 256, %o4
	EX_ST(STORE_INIT(%o1,%o5+%g3))
	add	%o5, 256, %o5
	sub	%g3, 192, %g3
	EX_ST(STORE_INIT(%o1,%o5+%g3))
	add	%g3, 64, %g3
	bge,pt	%XCC, .Lwrzero_loop
	 EX_ST(STORE_INIT(%o1,%o5+%g3))
	add	%o4, 256, %o4		/* undo the bias */

	brz,pn	%o4, .Lbsi_done
	 nop
	/* One cache line (64 bytes) at a time for the leftover lines.  */
.Lwrzero_small:
	EX_ST(STORE_INIT(%o1,%o5))
	subcc	%o4, 64, %o4
	bgu,pt	%XCC, .Lwrzero_small
	 add	%o5, 64, %o5

.Lasi_done:
.Lbsi_done:
	membar	#StoreStore		/* required by use of BIS */

.Lshort_set:
	cmp	%o4, 64			/* check if 64 bytes to set */
	blu	%XCC, 5f
	 nop
4:					/* set final blocks of 64 bytes */
	stx	%o1, [%o5]
	stx	%o1, [%o5+8]
	stx	%o1, [%o5+16]
	stx	%o1, [%o5+24]
	subcc	%o4, 64, %o4
	stx	%o1, [%o5+32]
	stx	%o1, [%o5+40]
	add	%o5, 64, %o5
	stx	%o1, [%o5-16]
	bgu,pt	%XCC, 4b
	 stx	%o1, [%o5-8]

5:
	/* Set the remaining long words.  */
.Lwrshort:
	subcc	%o3, 8, %o3		/* Can we store any long words? */
	blu,pn	%XCC, .Lwrchars
	 and	%o2, 7, %o2		/* calc bytes left after long words */
6:
	subcc	%o3, 8, %o3
	stx	%o1, [%o5]		/* store the long words */
	bgeu,pt	%XCC, 6b
	 add	%o5, 8, %o5

.Lwrchars:				/* check for extra chars */
	brnz	%o2, .Lwrfin
	 nop
	retl
	 nop

	/* Path for counts < 32: word-align, then 4-byte stores.  */
.Lwdalign:
	andcc	%o5, 3, %o3		/* is sp1 aligned on a word boundary */
	bz,pn	%XCC, .Lwrword
	 andn	%o2, 3, %o3		/* create word sized count in %o3 */

	dec	%o2			/* decrement count */
	stb	%o1, [%o5]		/* clear a byte */
	b	.Lwdalign
	 inc	%o5			/* next byte */

.Lwrword:
	subcc	%o3, 4, %o3
	st	%o1, [%o5]		/* 4-byte writing loop */
	bnz,pt	%XCC, .Lwrword
	 add	%o5, 4, %o5
	and	%o2, 3, %o2		/* leftover count, if any */

.Lwrchar:
	/* Set the remaining bytes, if any.  */
	brz	%o2, .Lexit
	 nop
.Lwrfin:
	deccc	%o2
	stb	%o1, [%o5]
	bgu,pt	%XCC, .Lwrfin
	 inc	%o5
.Lexit:
	retl				/* %o0 was preserved */
	 nop
END(__memset_niagara7)
#endif