]> git.ipfire.org Git - thirdparty/glibc.git/blob - sysdeps/ia64/memset.S
04a60257d4f455d9881c568db315dc4f0037b954
[thirdparty/glibc.git] / sysdeps / ia64 / memset.S
1 /* Optimized version of the standard memset() function.
2 This file is part of the GNU C Library.
3 Copyright (C) 2000-2014 Free Software Foundation, Inc.
4 Contributed by Dan Pop for Itanium <Dan.Pop@cern.ch>.
5 Rewritten for McKinley by Sverre Jarp, HP Labs/CERN <Sverre.Jarp@cern.ch>
6
7 The GNU C Library is free software; you can redistribute it and/or
8 modify it under the terms of the GNU Lesser General Public
9 License as published by the Free Software Foundation; either
10 version 2.1 of the License, or (at your option) any later version.
11
12 The GNU C Library is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
16
17 You should have received a copy of the GNU Lesser General Public
18 License along with the GNU C Library; if not, see
19 <http://www.gnu.org/licenses/>. */
20
21 /* Return: dest
22
23 Inputs:
24 in0: dest
25 in1: value
26 in2: count
27
28 The algorithm is fairly straightforward: store up to 15 leading bytes
29 (using st8/st4/st2/st1 as needed) until we get to a 16B-aligned address,
30 then loop on 128B chunks using an early store as prefetching, then loop
31 on 32B chunks, then set remaining words, finally set remaining bytes.
32 Since a stf.spill f0 can store 16B in one go, we use this instruction
33 to get peak speed when value = 0. */
34
35 #include <sysdep.h>
36 #undef ret
37 // Register, predicate and constant names used by the memset routine below.
38 #define dest in0 // first argument: destination pointer
39 #define value in1 // second argument: fill value (its low byte gets broadcast with mux1)
40 #define cnt in2 // third argument: byte count, decremented as stores are issued
41 // General registers used as temporaries.
42 #define tmp r31 // short-lived scratch (alignment bits, masks, loop counts)
43 #define save_lc r30 // copy of ar.lc taken at entry, written back on the exits other than the cnt == 0 return
44 #define ptr0 r29 // second store stream of the line loops (ptr2 + 16)
45 #define ptr1 r28 // main destination cursor
46 #define ptr2 r27 // secondary cursor: head alignment stores, then paired stores
47 #define ptr3 r26 // address of the last byte, used only in .move_bytes_unaligned
48 #define ptr9 r24 // store-ahead cursor, advanced by 128 per store
49 #define loopcnt r23 // value loaded into ar.lc for the store-ahead and 32B loops
50 #define linecnt r22 // cnt >> LSIZE_SH, the number of whole lines to fill
51 #define bytecnt r21 // (MIN1+1) - (dest & MIN1), bytes up to the next 16B boundary
52 // Floating-point register.
53 #define fvalue f6 // receives the broadcast value through setf.sig
54
55 // This routine uses only scratch predicate registers (p6 - p15)
56 #define p_scr p6 // default register for same-cycle branches
57 #define p_nz p7 // value != 0 (written at entry, not read again in this routine)
58 #define p_zr p8 // value == 0, selects the stf.spill loop at .l1b
59 #define p_unalgn p9 // dest & MIN1 is nonzero
60 #define p_y p11 // p_y/p_n: first yes/no pair, reused for successive tbit and cmp results
61 #define p_n p12
62 #define p_yy p13 // p_yy/p_nn: second yes/no pair
63 #define p_nn p14
64
65 #define movi0 mov // used below for every move to or from ar.lc (presumably marks an I0-unit mov, TODO confirm)
66
67 #define MIN1 15 // mask of the low four address bits (16B alignment minus one)
68 #define MIN1P1HALF 8 // (MIN1 + 1) / 2, offset of the upper 8B half of a 16B block
69 #define LINE_SIZE 128 // bytes filled per pass of the line loops
70 #define LSIZE_SH 7 // shift amount: log2 of LINE_SIZE
71 #define PREF_AHEAD 8 // upper bound on the lines touched by the initial store-ahead loop
72 // Pick the 8B store instruction and its source register.
73 #define USE_FLP // defined unconditionally, but USE_INT (if defined elsewhere) is tested first
74 #if defined(USE_INT)
75 #define store st8 // integer variant: st8 of the broadcast general register
76 #define myval value
77 #elif defined(USE_FLP)
78 #define store stf8 // FP variant: stf8 of fvalue
79 #define myval fvalue
80 #endif
81 // memset entry: returns at once for cnt = 0, sends cnt < 16 to .move_bytes_unaligned, otherwise fills the head up to a 16B boundary and picks a store loop.
82 .align 64
83 ENTRY(memset)
84 { .mmi
85 .prologue
86 alloc tmp = ar.pfs, 3, 0, 0, 0 // three inputs, no locals, no outputs
87 lfetch.nt1 [dest] // prefetch hint for the start of the destination
88 .save ar.lc, save_lc
89 movi0 save_lc = ar.lc // keep the incoming ar.lc so the exit paths can put it back
90 } { .mmi
91 .body
92 mov ret0 = dest // return value
93 cmp.ne p_nz, p_zr = value, r0 // use stf.spill if value is zero
94 cmp.eq p_scr, p0 = cnt, r0 // p_scr = (cnt == 0)
95 ;; }
96 { .mmi
97 and ptr2 = -(MIN1+1), dest // aligned address (dest rounded down to 16B)
98 and tmp = MIN1, dest // prepare to check for alignment (tmp = dest & 15)
99 tbit.nz p_y, p_n = dest, 0 // Do we have an odd address? (M_B_U)
100 } { .mib
101 mov ptr1 = dest // store cursor, replaced below when dest is unaligned
102 mux1 value = value, @brcst // create 8 identical bytes in word
103 (p_scr) br.ret.dpnt.many rp // return immediately if count = 0
104 ;; }
105 { .mib
106 cmp.ne p_unalgn, p0 = tmp, r0 // p_unalgn = dest is not 16B-aligned
107 } { .mib // bytecnt = 16 - (dest & 15): bytes up to the
108 sub bytecnt = (MIN1+1), tmp // next 16B boundary (only read under p_unalgn)
109 cmp.gt p_scr, p0 = 16, cnt // is it a minimalistic task? (16 > cnt)
110 (p_scr) br.cond.dptk.many .move_bytes_unaligned // go move just a few (M_B_U)
111 ;; }
112 { .mmi
113 (p_unalgn) add ptr1 = (MIN1+1), ptr2 // after alignment: first 16B boundary above dest
114 (p_unalgn) add ptr2 = MIN1P1HALF, ptr2 // upper 8B half of the 16B block holding dest
115 (p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 3 // should we do a st8 ? (.unc clears both when dest is aligned)
116 ;; }
117 { .mib
118 (p_y) add cnt = -8, cnt // the st8 below covers 8 bytes
119 (p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 2 // should we do a st4 ?
120 } { .mib
121 (p_y) st8 [ptr2] = value, -4 // fill the upper 8B, then step down to the 4B slot below it
122 (p_n) add ptr2 = 4, ptr2 // no st8: the 4B slot is 4 bytes higher
123 ;; }
124 { .mib
125 (p_yy) add cnt = -4, cnt // the st4 below covers 4 bytes
126 (p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 1 // should we do a st2 ?
127 } { .mib
128 (p_yy) st4 [ptr2] = value, -2 // fill 4B, then step down to the 2B slot below it
129 (p_nn) add ptr2 = 2, ptr2 // no st4: the 2B slot is 2 bytes higher
130 ;; }
131 { .mmi
132 mov tmp = LINE_SIZE+1 // for compare
133 (p_y) add cnt = -2, cnt // the st2 below covers 2 bytes
134 (p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 0 // should we do a st1 ?
135 } { .mmi
136 setf.sig fvalue=value // transfer value to FLP side
137 (p_y) st2 [ptr2] = value, -1 // fill 2B, then step down to the byte below it
138 (p_n) add ptr2 = 1, ptr2 // no st2: the byte slot is 1 byte higher
139 ;; }
140 // Only the optional single head byte is left to store and ptr1 sits on a 16B boundary.
141 { .mmi
142 (p_yy) st1 [ptr2] = value // last head byte
143 cmp.gt p_scr, p0 = tmp, cnt // is it a minimalistic task? (LINE_SIZE+1 > cnt, cnt read before the -1 below)
144 } { .mbb
145 (p_yy) add cnt = -1, cnt // the st1 above covers 1 byte
146 (p_scr) br.cond.dpnt.many .fraction_of_line // go move just a few
147 ;; }
148 // At least one whole line remains: branch to .l1b for a zero value, else fall through to the L1A loop.
149 { .mib
150 nop.m 0
151 shr.u linecnt = cnt, LSIZE_SH // linecnt = number of whole 128B lines left
152 (p_zr) br.cond.dptk.many .l1b // Jump to use stf.spill
153 ;; }
154 // L1A (value not zero): store one word into each upcoming 128B line first, then fill every line with 8B stores.
155 #ifndef GAS_ALIGN_BREAKS_UNWIND_INFO
156 .align 32 // -------- // L1A: store ahead into cache lines; fill later
157 #endif
158 { .mmi
159 and tmp = -(LINE_SIZE), cnt // compute end of range (bytes in whole lines)
160 mov ptr9 = ptr1 // used for prefetching
161 and cnt = (LINE_SIZE-1), cnt // remainder
162 } { .mmi
163 mov loopcnt = PREF_AHEAD-1 // default prefetch loop
164 cmp.gt p_scr, p0 = PREF_AHEAD, linecnt // check against actual value
165 ;; }
166 { .mmi
167 (p_scr) add loopcnt = -1, linecnt // fewer than PREF_AHEAD lines: store ahead into all of them
168 add ptr2 = 8, ptr1 // start of stores (beyond prefetch stores)
169 add ptr1 = tmp, ptr1 // first address beyond total range
170 ;; }
171 { .mmi
172 add tmp = -1, linecnt // next loop count
173 movi0 ar.lc = loopcnt
174 ;; }
175 .pref_l1a:
176 { .mib
177 store [ptr9] = myval, 128 // Do stores one cache line apart (this writes offset 0 of each line)
178 nop.i 0
179 br.cloop.dptk.few .pref_l1a
180 ;; }
181 { .mmi
182 add ptr0 = 16, ptr2 // Two stores in parallel
183 movi0 ar.lc = tmp // one pass of .l1ax per whole line
184 ;; }
185 .l1ax:
186 { .mmi
187 store [ptr2] = myval, 8 // line offsets 8 and 24
188 store [ptr0] = myval, 8
189 ;; }
190 { .mmi
191 store [ptr2] = myval, 24 // line offsets 16 and 32
192 store [ptr0] = myval, 24
193 ;; }
194 { .mmi
195 store [ptr2] = myval, 8 // line offsets 40 and 56
196 store [ptr0] = myval, 8
197 ;; }
198 { .mmi
199 store [ptr2] = myval, 24 // line offsets 48 and 64
200 store [ptr0] = myval, 24
201 ;; }
202 { .mmi
203 store [ptr2] = myval, 8 // line offsets 72 and 88
204 store [ptr0] = myval, 8
205 ;; }
206 { .mmi
207 store [ptr2] = myval, 24 // line offsets 80 and 96
208 store [ptr0] = myval, 24
209 ;; }
210 { .mmi
211 store [ptr2] = myval, 8 // line offset 104
212 store [ptr0] = myval, 32 // line offset 120, then on to offset 24 of the next line
213 cmp.lt p_scr, p0 = ptr9, ptr1 // do we need more prefetching?
214 ;; }
215 { .mmb
216 store [ptr2] = myval, 24 // line offset 112, then on to offset 8 of the next line
217 (p_scr) store [ptr9] = myval, 128 // keep the store-ahead going while it is inside the range
218 br.cloop.dptk.few .l1ax
219 ;; }
220 { .mbb
221 cmp.le p_scr, p0 = 8, cnt // at least 8 bytes left ?
222 (p_scr) br.cond.dpnt.many .fraction_of_line // Branch no. 2
223 br.cond.dpnt.many .move_bytes_from_alignment // Branch no. 3
224 ;; }
225 // L1B (value is zero): same scheme as the L1A code, but every store is a stf.spill of f0.
226 #ifdef GAS_ALIGN_BREAKS_UNWIND_INFO
227 { nop 0 }
228 #else
229 .align 32
230 #endif
231 .l1b: // ------------------ // L1B: store ahead into cache lines; fill later
232 { .mmi
233 and tmp = -(LINE_SIZE), cnt // compute end of range (bytes in whole lines)
234 mov ptr9 = ptr1 // used for prefetching
235 and cnt = (LINE_SIZE-1), cnt // remainder
236 } { .mmi
237 mov loopcnt = PREF_AHEAD-1 // default prefetch loop
238 cmp.gt p_scr, p0 = PREF_AHEAD, linecnt // check against actual value
239 ;; }
240 { .mmi
241 (p_scr) add loopcnt = -1, linecnt // fewer than PREF_AHEAD lines: store ahead into all of them
242 add ptr2 = 16, ptr1 // start of stores (beyond prefetch stores)
243 add ptr1 = tmp, ptr1 // first address beyond total range
244 ;; }
245 { .mmi
246 add tmp = -1, linecnt // next loop count
247 movi0 ar.lc = loopcnt
248 ;; }
249 .pref_l1b:
250 { .mib
251 stf.spill [ptr9] = f0, 128 // Do stores one cache line apart (this writes offset 0 of each line)
252 nop.i 0
253 br.cloop.dptk.few .pref_l1b
254 ;; }
255 { .mmi
256 add ptr0 = 16, ptr2 // Two stores in parallel
257 movi0 ar.lc = tmp // one pass of .l1bx per whole line
258 ;; }
259 .l1bx:
260 { .mmi
261 stf.spill [ptr2] = f0, 32 // line offsets 16 and 32
262 stf.spill [ptr0] = f0, 32
263 ;; }
264 { .mmi
265 stf.spill [ptr2] = f0, 32 // line offsets 48 and 64
266 stf.spill [ptr0] = f0, 32
267 ;; }
268 { .mmi
269 stf.spill [ptr2] = f0, 32 // line offset 80
270 stf.spill [ptr0] = f0, 64 // line offset 96, then on to offset 32 of the next line
271 cmp.lt p_scr, p0 = ptr9, ptr1 // do we need more prefetching?
272 ;; }
273 { .mmb
274 stf.spill [ptr2] = f0, 32 // line offset 112, then on to offset 16 of the next line
275 (p_scr) stf.spill [ptr9] = f0, 128 // keep the store-ahead going while it is inside the range
276 br.cloop.dptk.few .l1bx
277 ;; }
278 { .mib
279 cmp.gt p_scr, p0 = 8, cnt // just a few bytes left ? (otherwise fall through to .fraction_of_line)
280 (p_scr) br.cond.dpnt.many .move_bytes_from_alignment
281 ;; }
282 // At most 128 bytes left here: store 32B per loop pass, then up to three 8B words.
283 .fraction_of_line:
284 { .mib
285 add ptr2 = 16, ptr1 // second store stream, 16 bytes above ptr1
286 shr.u loopcnt = cnt, 5 // loopcnt = cnt / 32
287 ;; }
288 { .mib
289 cmp.eq p_scr, p0 = loopcnt, r0 // no complete 32B block? (tests loopcnt before the -1 below)
290 add loopcnt = -1, loopcnt // br.cloop runs ar.lc + 1 passes
291 (p_scr) br.cond.dpnt.many store_words
292 ;; }
293 { .mib
294 and cnt = 0x1f, cnt // compute the remaining cnt
295 movi0 ar.lc = loopcnt
296 ;; }
297 #ifndef GAS_ALIGN_BREAKS_UNWIND_INFO
298 .align 32
299 #endif
300 .l2: // ---------------------------- // L2A: store 32B in 2 cycles
301 { .mmb
302 store [ptr1] = myval, 8 // block offsets 0 and 16
303 store [ptr2] = myval, 8
304 ;; } { .mmb
305 store [ptr1] = myval, 24 // block offsets 8 and 24, then both cursors move on by 32
306 store [ptr2] = myval, 24
307 br.cloop.dptk.many .l2
308 ;; }
309 store_words: // NOTE(review): spelled without the leading dot the other labels in this routine use
310 { .mib
311 cmp.gt p_scr, p0 = 8, cnt // just a few bytes left ?
312 (p_scr) br.cond.dpnt.many .move_bytes_from_alignment // Branch
313 ;; }
314 // 8 to 31 bytes left: one, two or three 8B stores.
315 { .mmi
316 store [ptr1] = myval, 8 // store first word
317 cmp.le p_y, p_n = 16, cnt // second word needed if at least 16 bytes were left
318 add cnt = -8, cnt // subtract
319 ;; }
320 { .mmi
321 (p_y) store [ptr1] = myval, 8 // store second word
322 (p_y) cmp.le.unc p_yy, p_nn = 16, cnt // third word needed if at least 16 bytes are still left
323 (p_y) add cnt = -8, cnt // subtract
324 ;; }
325 { .mmi
326 (p_yy) store [ptr1] = myval, 8 // store third word
327 (p_yy) add cnt = -8, cnt // subtract
328 ;; }
329 // Tail: fewer than 8 bytes left at ptr1, cnt bits 2, 1 and 0 select a final st4, st2 and st1.
330 .move_bytes_from_alignment:
331 { .mib
332 cmp.eq p_scr, p0 = cnt, r0 // nothing left?
333 tbit.nz.unc p_y, p0 = cnt, 2 // should we terminate with a st4 ?
334 (p_scr) br.cond.dpnt.few .restore_and_exit
335 ;; }
336 { .mib
337 (p_y) st4 [ptr1] = value, 4
338 tbit.nz.unc p_yy, p0 = cnt, 1 // should we terminate with a st2 ?
339 ;; }
340 { .mib
341 (p_yy) st2 [ptr1] = value, 2
342 tbit.nz.unc p_y, p0 = cnt, 0 // should we terminate with a st1 ?
343 ;; }
344 // cnt bit 0 set: one last byte.
345 { .mib
346 (p_y) st1 [ptr1] = value
347 ;; }
348 .restore_and_exit:
349 { .mib
350 nop.m 0
351 movi0 ar.lc = save_lc // put back the ar.lc value saved at entry
352 br.ret.sptk.many rp
353 ;; }
354 // Short path for 16 > cnt: p_y/p_n still hold the odd/even test of dest made at entry, stores are st1 and st2 only.
355 .move_bytes_unaligned:
356 { .mmi
357 .pred.rel "mutex",p_y, p_n
358 .pred.rel "mutex",p_yy, p_nn
359 (p_n) cmp.le p_yy, p_nn = 4, cnt // even start: at least 4 bytes to fill?
360 (p_y) cmp.le p_yy, p_nn = 5, cnt // odd start: at least 4 bytes after the leading st1?
361 (p_n) add ptr2 = 2, ptr1 // second st2 stream, 2 bytes above the first
362 } { .mmi
363 (p_y) add ptr2 = 3, ptr1 // same, allowing for the leading byte
364 (p_y) st1 [ptr1] = value, 1 // fill 1 (odd-aligned) byte
365 (p_y) add cnt = -1, cnt // [15, 14 (or less) left]
366 ;; }
367 { .mmi
368 (p_yy) cmp.le.unc p_y, p0 = 8, cnt // another 4 bytes after this round?
369 add ptr3 = ptr1, cnt // prepare last store
370 movi0 ar.lc = save_lc // write back the ar.lc value saved at entry
371 } { .mmi
372 (p_yy) st2 [ptr1] = value, 4 // fill 2 (aligned) bytes
373 (p_yy) st2 [ptr2] = value, 4 // fill 2 (aligned) bytes
374 (p_yy) add cnt = -4, cnt // [11, 10 (or less) left]
375 ;; }
376 { .mmi
377 (p_y) cmp.le.unc p_yy, p0 = 8, cnt // another 4 bytes after this round?
378 add ptr3 = -1, ptr3 // last store (address of the final byte)
379 tbit.nz p_scr, p0 = cnt, 1 // will there be a st2 at the end ?
380 } { .mmi
381 (p_y) st2 [ptr1] = value, 4 // fill 2 (aligned) bytes
382 (p_y) st2 [ptr2] = value, 4 // fill 2 (aligned) bytes
383 (p_y) add cnt = -4, cnt // [7, 6 (or less) left]
384 ;; }
385 { .mmi
386 (p_yy) st2 [ptr1] = value, 4 // fill 2 (aligned) bytes
387 (p_yy) st2 [ptr2] = value, 4 // fill 2 (aligned) bytes
388 // [3, 2 (or less) left]
389 tbit.nz p_y, p0 = cnt, 0 // will there be a st1 at the end ?
390 } { .mmi
391 (p_yy) add cnt = -4, cnt
392 ;; }
393 { .mmb
394 (p_scr) st2 [ptr1] = value // fill 2 (aligned) bytes
395 (p_y) st1 [ptr3] = value // fill last byte (using ptr3)
396 br.ret.sptk.many rp
397 ;; }
398 END(memset)
399 libc_hidden_builtin_def (memset)