/* Optimized version of the standard bzero() function.
   This file is part of the GNU C Library.
   Copyright (C) 2000-2015 Free Software Foundation, Inc.
   Contributed by Dan Pop for Itanium <Dan.Pop@cern.ch>.
   Rewritten for McKinley by Sverre Jarp, HP Labs/CERN <Sverre.Jarp@cern.ch>

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

/* Return: dest

   Inputs:
	in0:	dest
	in1:	count

   The algorithm is fairly straightforward: set byte by byte until we
   get to a 16B-aligned address, then loop on 128 B chunks using an
   early store as prefetching, then loop on 32 B chunks, then clear
   remaining words, finally clear remaining bytes.
   Since a stf.spill f0 can store 16B in one go, we use this instruction
   to get peak speed.  */

#include <sysdep.h>
#undef ret

/* Symbolic names for the two incoming stacked arguments.  */
#define dest		in0
#define cnt		in1

/* Scratch general registers used by the routine.  */
#define tmp		r31
#define save_lc		r30	// caller's ar.lc, restored before return
#define ptr0		r29
#define ptr1		r28
#define ptr2		r27
#define ptr3		r26
#define ptr9		r24	// prefetch/store-ahead pointer
#define loopcnt		r23
#define linecnt		r22	// number of 128B cache lines to clear
#define bytecnt		r21	// bytes needed to reach 16B alignment

// This routine uses only scratch predicate registers (p6 - p15)
#define p_scr		p6	// default register for same-cycle branches
#define p_unalgn	p9
#define p_y		p11
#define p_n		p12
#define p_yy		p13
#define p_nn		p14

/* Plain integer move; kept as a macro so the I-unit/M-unit choice can
   be switched in one place if rescheduling is ever needed.  */
#define movi0		mov

#define MIN1		15	// alignment mask: dest & MIN1 == 0 => 16B aligned
#define MIN1P1HALF	8	// (MIN1+1)/2: midpoint used for the downward stores
#define LINE_SIZE	128	// cache-line sized chunk for the main loop
#define LSIZE_SH	7	// shift amount (log2 of LINE_SIZE)
#define PREF_AHEAD	8	// number of lines stored ahead as prefetch

/* Select the store flavor for the word loops: integer st8 of r0 or
   floating-point stf8 of f0 — both write 8 zero bytes.  */
#define USE_FLP
#if defined(USE_INT)
#define store		st8
#define myval		r0
#elif defined(USE_FLP)
#define store		stf8
#define myval		f0
#endif

	.align	64
ENTRY(bzero)
{ .mmi
	.prologue
	alloc	tmp = ar.pfs, 2, 0, 0, 0
	lfetch.nt1 [dest]			// start fetching the first line early
	.save	ar.lc, save_lc
	movi0	save_lc = ar.lc			// ar.lc is preserved; save it
} { .mmi
	.body
	mov	ret0 = dest			// return value is dest itself
	nop.m	0
	cmp.eq	p_scr, p0 = cnt, r0		// count == 0 ?
;; }
{ .mmi
	and	ptr2 = -(MIN1+1), dest		// aligned address (dest rounded down to 16B)
	and	tmp = MIN1, dest		// prepare to check for alignment
	tbit.nz	p_y, p_n = dest, 0		// Do we have an odd address? (M_B_U)
} { .mib
	mov	ptr1 = dest
	nop.i	0
(p_scr)	br.ret.dpnt.many rp			// return immediately if count = 0
;; }
{ .mib
	cmp.ne	p_unalgn, p0 = tmp, r0		// dest not 16B aligned ?
} { .mib					// NB: # of bytes to move is 1
	sub	bytecnt = (MIN1+1), tmp		//     higher than loopcnt
	cmp.gt	p_scr, p0 = 16, cnt		// is it a minimalistic task?
(p_scr)	br.cond.dptk.many .move_bytes_unaligned	// go move just a few (M_B_U)
;; }
/* Alignment phase: bytecnt (1..16) holds the distance to the next 16B
   boundary.  Each set bit of bytecnt triggers one store (st8/st4/st2/st1);
   ptr2 is walked with negative post-increments so the stores land
   back-to-back, and cnt is decremented accordingly.  */
{ .mmi
(p_unalgn) add	ptr1 = (MIN1+1), ptr2		// after alignment
(p_unalgn) add	ptr2 = MIN1P1HALF, ptr2		// after alignment
(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 3	// should we do a st8 ?
;; }
{ .mib
(p_y)	add	cnt = -8, cnt
(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 2	// should we do a st4 ?
} { .mib
(p_y)	st8	[ptr2] = r0,-4			// store 8 zero bytes, step back 4
(p_n)	add	ptr2 = 4, ptr2
;; }
{ .mib
(p_yy)	add	cnt = -4, cnt
(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 1	// should we do a st2 ?
} { .mib
(p_yy)	st4	[ptr2] = r0,-2			// store 4 zero bytes, step back 2
(p_nn)	add	ptr2 = 2, ptr2
;; }
{ .mmi
	mov	tmp = LINE_SIZE+1		// for compare
(p_y)	add	cnt = -2, cnt
(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 0	// should we do a st1 ?
} { .mmi
	nop.m	0
(p_y)	st2	[ptr2] = r0,-1			// store 2 zero bytes, step back 1
(p_n)	add	ptr2 = 1, ptr2
;; }

{ .mmi
(p_yy)	st1	[ptr2] = r0
	cmp.gt	p_scr, p0 = tmp, cnt		// is it a minimalistic task?
} { .mbb
(p_yy)	add	cnt = -1, cnt
(p_scr)	br.cond.dpnt.many .fraction_of_line	// less than a full line left
;; }
{ .mib
	nop.m	0
	shr.u	linecnt = cnt, LSIZE_SH		// linecnt = cnt / 128
	nop.b	0
;; }

	.align 32
.l1b:	// ------------------ L1B: store ahead into cache lines; fill later
{ .mmi
	and	tmp = -(LINE_SIZE), cnt		// compute end of range
	mov	ptr9 = ptr1			// used for prefetching
	and	cnt = (LINE_SIZE-1), cnt	// remainder
} { .mmi
	mov	loopcnt = PREF_AHEAD-1		// default prefetch loop
	cmp.gt	p_scr, p0 = PREF_AHEAD, linecnt	// check against actual value
;; }
{ .mmi
(p_scr)	add	loopcnt = -1, linecnt
	add	ptr2 = 16, ptr1			// start of stores (beyond prefetch stores)
	add	ptr1 = tmp, ptr1		// first address beyond total range
;; }
{ .mmi
	add	tmp = -1, linecnt		// next loop count
	movi0	ar.lc = loopcnt
;; }
/* Store-ahead loop: one 16B spill per cache line brings the line into
   cache ahead of the fill loop below (the store itself zeroes 16B).  */
.pref_l1b:
{ .mib
	stf.spill [ptr9] = f0, 128		// Do stores one cache line apart
	nop.i	0
	br.cloop.dptk.few .pref_l1b
;; }
{ .mmi
	add	ptr0 = 16, ptr2			// Two stores in parallel
	movi0	ar.lc = tmp
;; }
/* Main loop: clear one 128B line per iteration with eight 16B spills
   via two pointers, keeping the store-ahead going while lines remain.  */
.l1bx:
{ .mmi
	stf.spill [ptr2] = f0, 32
	stf.spill [ptr0] = f0, 32
;; }
{ .mmi
	stf.spill [ptr2] = f0, 32
	stf.spill [ptr0] = f0, 32
;; }
{ .mmi
	stf.spill [ptr2] = f0, 32
	stf.spill [ptr0] = f0, 64
	cmp.lt	p_scr, p0 = ptr9, ptr1		// do we need more prefetching?
;; }
{ .mmb
	stf.spill [ptr2] = f0, 32
(p_scr)	stf.spill [ptr9] = f0, 128
	br.cloop.dptk.few .l1bx
;; }
{ .mib
	cmp.gt	p_scr, p0 = 8, cnt		// just a few bytes left ?
(p_scr)	br.cond.dpnt.many .move_bytes_from_alignment
;; }

.fraction_of_line:
{ .mib
	add	ptr2 = 16, ptr1
	shr.u	loopcnt = cnt, 5		// loopcnt = cnt / 32
;; }
{ .mib
	cmp.eq	p_scr, p0 = loopcnt, r0
	add	loopcnt = -1, loopcnt
(p_scr)	br.cond.dpnt.many .store_words		// no full 32B chunk left
;; }
{ .mib
	and	cnt = 0x1f, cnt			// compute the remaining cnt
	movi0	ar.lc = loopcnt
;; }
	.align 32
.l2:	// ----------------------------- L2A: store 32B in 2 cycles
{ .mmb
	store	[ptr1] = myval, 8
	store	[ptr2] = myval, 8
;; } { .mmb
	store	[ptr1] = myval, 24
	store	[ptr2] = myval, 24
	br.cloop.dptk.many .l2
;; }
.store_words:
{ .mib
	cmp.gt	p_scr, p0 = 8, cnt		// just a few bytes left ?
(p_scr)	br.cond.dpnt.many .move_bytes_from_alignment	// Branch
;; }

/* Clear up to three remaining 8-byte words (cnt is 8..31 here).  */
{ .mmi
	store	[ptr1] = myval, 8		// store
	cmp.le	p_y, p_n = 16, cnt		// another word after this one?
	add	cnt = -8, cnt			// subtract
;; }
{ .mmi
(p_y)	store	[ptr1] = myval, 8		// store
(p_y)	cmp.le.unc p_yy, p_nn = 16, cnt		// and a third?
(p_y)	add	cnt = -8, cnt			// subtract
;; }
{ .mmi						// store
(p_yy)	store	[ptr1] = myval, 8
(p_yy)	add	cnt = -8, cnt			// subtract
;; }

/* Tail: cnt is now 0..7; emit at most one st4, one st2 and one st1
   according to its low three bits.  */
.move_bytes_from_alignment:
{ .mib
	cmp.eq	p_scr, p0 = cnt, r0
	tbit.nz.unc p_y, p0 = cnt, 2		// should we terminate with a st4 ?
(p_scr)	br.cond.dpnt.few .restore_and_exit
;; }
{ .mib
(p_y)	st4	[ptr1] = r0,4
	tbit.nz.unc p_yy, p0 = cnt, 1		// should we terminate with a st2 ?
;; }
{ .mib
(p_yy)	st2	[ptr1] = r0,2
	tbit.nz.unc p_y, p0 = cnt, 0		// should we terminate with a st1 ?
;; }

{ .mib
(p_y)	st1	[ptr1] = r0
;; }
.restore_and_exit:
{ .mib
	nop.m	0
	movi0	ar.lc = save_lc			// restore caller's loop counter
	br.ret.sptk.many rp
;; }

/* Short path for cnt < 16: p_y/p_n say whether dest is odd (set near
   the entry).  Clear an optional leading byte, then pairs of st2
   through two interleaved pointers, then an optional trailing st2/st1;
   ptr3 covers the final byte.  */
.move_bytes_unaligned:
{ .mmi
	.pred.rel "mutex",p_y, p_n
	.pred.rel "mutex",p_yy, p_nn
(p_n)	cmp.le	p_yy, p_nn = 4, cnt
(p_y)	cmp.le	p_yy, p_nn = 5, cnt
(p_n)	add	ptr2 = 2, ptr1
} { .mmi
(p_y)	add	ptr2 = 3, ptr1
(p_y)	st1	[ptr1] = r0, 1			// fill 1 (odd-aligned) byte
(p_y)	add	cnt = -1, cnt			// [15, 14 (or less) left]
;; }
{ .mmi
(p_yy)	cmp.le.unc p_y, p0 = 8, cnt
	add	ptr3 = ptr1, cnt		// prepare last store
	movi0	ar.lc = save_lc			// restore caller's loop counter
} { .mmi
(p_yy)	st2	[ptr1] = r0, 4			// fill 2 (aligned) bytes
(p_yy)	st2	[ptr2] = r0, 4			// fill 2 (aligned) bytes
(p_yy)	add	cnt = -4, cnt			// [11, 10 (or less) left]
;; }
{ .mmi
(p_y)	cmp.le.unc p_yy, p0 = 8, cnt
	add	ptr3 = -1, ptr3			// last store
	tbit.nz	p_scr, p0 = cnt, 1		// will there be a st2 at the end ?
} { .mmi
(p_y)	st2	[ptr1] = r0, 4			// fill 2 (aligned) bytes
(p_y)	st2	[ptr2] = r0, 4			// fill 2 (aligned) bytes
(p_y)	add	cnt = -4, cnt			// [7, 6 (or less) left]
;; }
{ .mmi
(p_yy)	st2	[ptr1] = r0, 4			// fill 2 (aligned) bytes
(p_yy)	st2	[ptr2] = r0, 4			// fill 2 (aligned) bytes
						// [3, 2 (or less) left]
	tbit.nz	p_y, p0 = cnt, 0		// will there be a st1 at the end ?
} { .mmi
(p_yy)	add	cnt = -4, cnt
;; }
{ .mmb
(p_scr)	st2	[ptr1] = r0			// fill 2 (aligned) bytes
(p_y)	st1	[ptr3] = r0			// fill last byte (using ptr3)
	br.ret.sptk.many rp
;; }
END(bzero)